In [1]:
class Args(object):
    '''
        Plain attribute container holding the training configuration of this notebook.
        Every constructor argument is stored unchanged under the same name; in addition
        `max_grad_norm` is fixed to 1.0.
    '''
    def __init__(self, model_name, train_path, dev_path, test_path, add_eos_token_to_data, margin, max_len,
                 number_of_gpu, batch_size_per_gpu, gradient_accumulation_steps, effective_batch_size,
                 total_steps, print_every, save_every, learning_rate, save_path_prefix):
        # model and data locations
        self.model_name, self.train_path = model_name, train_path
        self.dev_path, self.test_path = dev_path, test_path
        # data / loss options
        self.add_eos_token_to_data, self.margin, self.max_len = add_eos_token_to_data, margin, max_len
        # batching
        self.number_of_gpu, self.batch_size_per_gpu = number_of_gpu, batch_size_per_gpu
        self.gradient_accumulation_steps, self.effective_batch_size = \
            gradient_accumulation_steps, effective_batch_size
        # schedule, logging and checkpointing
        self.total_steps, self.print_every, self.save_every = total_steps, print_every, save_every
        self.learning_rate, self.save_path_prefix = learning_rate, save_path_prefix
        # not configurable through the constructor
        self.max_grad_norm = 1.0
# Concrete configuration for this run. Positional order follows Args.__init__:
#   model_name, train_path, dev_path, test_path, add_eos_token_to_data, margin, max_len,
#   number_of_gpu, batch_size_per_gpu, gradient_accumulation_steps, effective_batch_size,
#   total_steps, print_every, save_every, learning_rate, save_path_prefix
# effective_batch_size (128) equals number_of_gpu * batch_size_per_gpu * gradient_accumulation_steps
# (1 * 32 * 4); model_training asserts this relation.
# NOTE(review): add_eos_token_to_data is the string 'True', not a bool. Any non-empty string is
# truthy, so 'False' would also enable it wherever the value is used in a plain `if` -- confirm
# how the consuming code interprets it.
# NOTE(review): the dataset and output paths are absolute Kaggle paths.
args = Args('gpt2', "/kaggle/input/flickr30k/flickr30k/flickr30k/flickr30k_train.json",
           "/kaggle/input/flickr30k/flickr30k/flickr30k/flickr30k_val.json", 
           "/kaggle/input/flickr30k/flickr30k/flickr30k/flickr30k_test.json",
           'True', 0.5, 64, 1, 32, 4, 128, 10000, 50, 250, 2e-5, "/kaggle/working/")

In [2]:
#dataclass.py
import json
import random
import torch
import numpy as np
import progressbar
from torch.nn.utils import rnn

class Data:
    '''
        Reads the caption json files, tokenizes every caption once at construction time and
        serves padded (input, label) batches for training and for dev/test evaluation.
    '''
    def __init__(self, model_name, train_path, dev_path, test_path, max_len,
        sos_token, pad_token, add_eos_token_to_data):
        '''
            model_name: name handed to GPT2TokenizerFast.from_pretrained, e.g. gpt2
            train_path / dev_path / test_path: json files with the captions of each split
            max_len: maximum number of caption tokens kept per sequence
            sos_token: start-of-sequence token, added to the vocabulary when missing
            pad_token: padding token, added to the vocabulary when missing
            add_eos_token_to_data: when truthy the eos token is appended to every sequence,
                so the model can learn to stop generation by emitting it
        '''
        from transformers import GPT2TokenizerFast
        self.tokenizer = GPT2TokenizerFast.from_pretrained(model_name)

        self.sos_token, self.sos_token_id = self.add_special_token(sos_token)
        print (f'sos token is {self.sos_token}, sos token id is {self.sos_token_id}')
        self.pad_token, self.pad_token_id = self.add_special_token(pad_token)
        print (f'pad token is {self.pad_token}, pad token id is {self.pad_token_id}')
        # the tokenizer's bos token doubles as the eos token here
        self.eos_token = self.tokenizer.bos_token
        self.eos_token_id = self.tokenizer.bos_token_id
        print (f'eos token is {self.eos_token}, eos token id is {self.eos_token_id}')
        self.add_eos_token_to_data = add_eos_token_to_data

        self.max_len = max_len
        self.train_token_list, self.train_token_id_list = self.process_one_file(train_path)
        self.dev_token_list, self.dev_token_id_list = self.process_one_file(dev_path)
        self.test_token_list, self.test_token_id_list = self.process_one_file(test_path)
        self.train_num = len(self.train_token_list)
        self.dev_num = len(self.dev_token_list)
        self.test_num = len(self.test_token_list)
        print (f'train number:{self.train_num}, dev number:{self.dev_num}, test number:{self.test_num}')

        # index lists; only the training order is shuffled
        self.train_idx_list = list(range(self.train_num))
        random.shuffle(self.train_idx_list)
        self.dev_idx_list = list(range(self.dev_num))
        self.test_idx_list = list(range(self.test_num))
        # cursors used by get_next_validation_batch
        self.dev_current_idx, self.test_current_idx = 0, 0

    def add_special_token(self, special_token):
        '''Make sure special_token is in the tokenizer vocabulary; return (token, token id).'''
        already_known = special_token in self.tokenizer.vocab
        if already_known:
            print (special_token + ' token exists.')
        else:
            print ('Add token to the tokenizer.')
            print (f'Original vocabulary size is {len(self.tokenizer)}')
            self.tokenizer.add_tokens([special_token])
            print (f'Vocabulary size after extension is {len(self.tokenizer)}')
            assert len(self.tokenizer.convert_tokens_to_ids([special_token])) == 1
        return special_token, self.tokenizer.convert_tokens_to_ids([special_token])[0]

    def process_one_file(self, path):
        '''Tokenize every caption found in the json file; return (token lists, token id lists).'''
        print (f'Processing {path}')
        with open(path) as f:
            records = json.load(f)
        captions = [caption.strip() for record in records for caption in record['captions']]

        token_lists, token_id_lists = [], []
        total = len(captions)
        bar = progressbar.ProgressBar(total)
        bar.start()
        for position, caption in enumerate(captions):
            bar.update(position)
            self.process_one_text(caption.strip('\n'), token_lists, token_id_lists)
        bar.finish()
        print (f'{path} processed!')
        return token_lists, token_id_lists

    def process_one_text(self, text, res_token_list, res_token_id_list):
        '''Tokenize one caption and append it to both result lists (in place).'''
        pieces = self.tokenizer.tokenize(text, max_length=self.max_len, truncation=True)
        if len(pieces) > 1:  # captions of at most one token are dropped
            pieces = [self.sos_token] + pieces[:self.max_len]
            if self.add_eos_token_to_data:
                pieces.append(self.eos_token)
            res_token_list.append(pieces)
            res_token_id_list.append(self.tokenizer.convert_tokens_to_ids(pieces))
        return

    def pad_batch(self, batch_id_list):
        '''Right-pad the id sequences with pad_token_id; return (padded ids, float 1/0 mask).'''
        as_tensors = [torch.LongTensor(seq) for seq in batch_id_list]
        padded = rnn.pad_sequence(as_tensors, batch_first=True, padding_value=self.pad_token_id)
        is_pad = padded.eq(self.pad_token_id)
        mask = torch.ones_like(padded).masked_fill(is_pad, 0.0).type(torch.FloatTensor)
        return padded, mask

    def process_output(self, batch_tgt_id_list):
        '''Pad the batch and split it into inputs (all but last) and targets (all but first).'''
        as_tensors = [torch.LongTensor(seq) for seq in batch_tgt_id_list]
        padded, _ = self.pad_batch(as_tensors)
        return padded[:, :-1].clone(), padded[:, 1:].clone()

    def parse_batch(self, batch_id_list):
        '''Return (inputs, labels); padded label positions are replaced by -100.'''
        inputs, labels = self.process_output(batch_id_list)
        labels[labels == self.pad_token_id] = -100
        return inputs, labels

    def get_next_train_batch(self, batch_size):
        '''Sample batch_size distinct training instances; return (inputs, labels, token lists).'''
        chosen = random.sample(self.train_idx_list, batch_size)
        batch_id_list = [self.train_token_id_list[idx] for idx in chosen]
        batch_token_list = [self.train_token_list[idx] for idx in chosen]
        batch_input_tensor, batch_labels = self.parse_batch(batch_id_list)
        return batch_input_tensor, batch_labels, batch_token_list

    def get_next_validation_batch(self, batch_size, mode):
        '''
            Return the next sequential batch of the 'dev' or 'test' split as
            (inputs, labels, token lists) and advance that split's cursor.
            When fewer than batch_size + 1 instances remain, positions past the end are
            filled with instance 0 and the cursor is reset to 0.
        '''
        if mode == 'dev':
            cursor_name, instance_num = 'dev_current_idx', self.dev_num
            id_pool, token_pool = self.dev_token_id_list, self.dev_token_list
        elif mode == 'test':
            cursor_name, instance_num = 'test_current_idx', self.test_num
            id_pool, token_pool = self.test_token_id_list, self.test_token_list
        else:
            raise Exception('Wrong Validation Mode!!!')

        start = getattr(self, cursor_name)
        reaches_end = not (start + batch_size < instance_num)
        picked = [start + offset for offset in range(batch_size)]
        if reaches_end:
            # indices beyond the last instance fall back to the first instance
            picked = [idx if idx <= instance_num - 1 else 0 for idx in picked]

        batch_id_list = [id_pool[idx] for idx in picked]
        batch_token_list = [token_pool[idx] for idx in picked]
        setattr(self, cursor_name, 0 if reaches_end else start + batch_size)

        batch_input_tensor, batch_labels = self.parse_batch(batch_id_list)
        return batch_input_tensor, batch_labels, batch_token_list

In [3]:
import sys
import os
import operator
from operator import itemgetter
import torch
from torch import nn
import torch.nn.functional as F
import random
import numpy as np
import argparse
import random

def parse_prompt(text):
    '''
        Clean up a prompt / generated text:
          1. remove leading and trailing '<|endoftext|>' tokens and surrounding whitespace;
          2. if the first '[' is followed by the first ']' at most 6 characters later,
             drop that short bracketed span (brackets included);
          3. collapse every run of whitespace into a single space.
        Returns the cleaned string.
    '''
    eos_token = '<|endoftext|>'
    text = text.strip()
    # str.strip(chars) treats its argument as a *set of characters*, so stripping with the
    # eos token would also eat ordinary letters such as 't', 'e', 'x', 'd' from both ends of
    # the text. Remove whole-token prefixes / suffixes instead.
    while text.startswith(eos_token):
        text = text[len(eos_token):].lstrip()
    while text.endswith(eos_token):
        text = text[:-len(eos_token)].rstrip()

    # positions of the first '[' and the first ']' (-1 when absent)
    left_bracket_idx = text.find('[')
    right_bracket_idx = text.find(']')
    is_short_span = (
        left_bracket_idx > -1
        and right_bracket_idx > left_bracket_idx
        and right_bracket_idx - left_bracket_idx <= 6
    )
    if is_short_span:
        text = text[:left_bracket_idx] + text[right_bracket_idx + 1:]

    return ' '.join(text.split())

def typical_filtering(scores, mass, min_tokens_to_keep, filter_value):
    '''
        Typical-sampling filter over a batch of logits.

        scores: 2-D tensor of logits (rows are distributions over dim=-1)
        mass: cumulative probability to keep
        min_tokens_to_keep: when > 1, the first min_tokens_to_keep tokens in "typicality"
            order are never removed
        filter_value: value written into the removed positions

        Tokens are ordered by |(-log p) - entropy| (most typical first); tokens whose
        deviation is larger than that of the token at which the cumulative probability
        reaches `mass` are replaced by filter_value. Returns a new tensor; `scores` itself
        is not modified.
    '''
    # calculate entropy
    normalized = torch.nn.functional.log_softmax(scores, dim=-1)
    p = torch.exp(normalized)
    ent = -(normalized * p).nansum(-1, keepdim=True)

    # shift and sort
    shifted_scores = torch.abs((-normalized) - ent)
    sorted_scores, sorted_indices = torch.sort(shifted_scores, descending=False)
    sorted_logits = scores.gather(-1, sorted_indices)
    cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)

    # Remove tokens with cumulative mass above the threshold
    last_ind = (cumulative_probs < mass).sum(dim=1)
    # When every cumulative probability is below `mass` (mass >= 1, or rounding in the
    # cumsum) the count equals the vocabulary size, which is one past the last valid
    # index for the gather below -- clamp it to the last position.
    last_ind.clamp_(max=sorted_scores.shape[-1] - 1)
    sorted_indices_to_remove = sorted_scores > sorted_scores.gather(1, last_ind.view(-1, 1))
    if min_tokens_to_keep > 1:
        # Never remove the min_tokens_to_keep most typical tokens
        sorted_indices_to_remove[..., : min_tokens_to_keep] = 0
    # map the removal mask from sorted order back to vocabulary order
    indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)

    scores = scores.masked_fill(indices_to_remove, filter_value)
    return scores

def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, threshold=-float('Inf'), filter_value=-np.inf):
    '''
        Filter a 1-D logits tensor in place with top-k, nucleus (top-p) and a hard threshold.

        top_k > 0: keep only the top_k largest logits
        top_p > 0.0: keep the smallest set of highest logits whose cumulative probability
            exceeds top_p (the token crossing the boundary is kept)
        threshold: logits below this value are always removed
        Removed positions are set to filter_value. The same (modified) tensor is returned.
    '''
    assert logits.dim() == 1
    vocab_size = logits.size(-1)
    k = min(top_k, vocab_size)
    if k > 0:
        # everything strictly below the k-th best logit goes away
        kth_best = torch.topk(logits, k)[0][..., -1, None]
        logits[logits < kth_best] = filter_value
    if top_p > 0.0:
        ranked_logits, ranked_ids = torch.sort(logits, descending=True)
        cumulative = torch.cumsum(F.softmax(ranked_logits, dim=-1), dim=-1)
        beyond_nucleus = cumulative > top_p
        # shift right by one so the first token crossing top_p survives; rank 0 is always kept
        drop_ranked = torch.zeros_like(beyond_nucleus)
        drop_ranked[..., 1:] = beyond_nucleus[..., :-1]
        logits[ranked_ids[drop_ranked]] = filter_value

    logits[logits < threshold] = filter_value
    return logits

# ========== batch version ========= #
def ranking_fast(context_hidden, next_hidden, next_top_k_probs, alpha, beam_width):
    '''
        Pick, for every prefix in the batch, the best of its beam_width candidates.

        context_hidden: bsz*beam x seqlen x embed_dim
        next_hidden: bsz*beam x 1 x embed_dim
        next_top_k_probs: bsz x beam
        Each candidate is scored as
            (1 - alpha) * probability - alpha * max cosine similarity to the context states
        and the index (within the beam) of the highest score is returned, shape [bsz].
    '''
    _, _context_len, _embed_dim = context_hidden.size()
    unit_context = context_hidden / context_hidden.norm(dim=2, keepdim=True)
    unit_next = next_hidden / next_hidden.norm(dim=2, keepdim=True)
    # cosine similarity of each candidate state with every context position: [B*K, S]
    similarity = torch.matmul(unit_context, unit_next.transpose(1, 2)).squeeze(-1)
    degeneration_penalty = similarity.max(dim=-1)[0]    # [B*K]
    confidence = next_top_k_probs.view(-1)    # [B*K]
    combined = (1.0 - alpha) * confidence - alpha * degeneration_penalty
    per_prefix = torch.stack(torch.split(combined, beam_width))    # [B, K]
    _, selected_idx = per_prefix.max(dim=-1)    # [B]
    return selected_idx

def ContrastiveDecodingOneStepFast(
    model, 
    ids, 
    beam_width, 
    alpha, 
    past_key_values,
    last_hidden_states,
    vocab,
    logit_for_next_step,
    first_step=False,
    ):
    '''
        Run one decoding step in which the top beam_width next-token candidates are
        re-ranked by ranking_fast and one of them is chosen per batch row.

        model: callable accepting input_ids / past_key_values / use_cache /
            output_hidden_states (and attention_mask) whose output exposes
            .past_key_values, .hidden_states and .logits
        ids: token ids of the prefix; only fed to the model when first_step is True
        beam_width: number of candidates (K) considered per batch row
        alpha: weight forwarded to ranking_fast
        past_key_values, last_hidden_states, logit_for_next_step: state returned by the
            previous call; all three are recomputed from `ids` when first_step is True
        vocab: not used by this function
        first_step: whether the prefix still has to be encoded

        Returns (next_id, past_key_values, last_hidden_states, logits): the chosen token
        ids [B, 1], the cache restricted to the chosen candidates, the hidden states with
        the chosen candidate's state appended, and the chosen candidate's next-step logits.
        Shape comments below are the ones given in the original code.
    '''
    # input_ids: [B, S]
    if first_step:
        # encode the whole prefix once to initialise the cache, hidden states and logits
        output = model(
            input_ids=ids, 
            past_key_values=past_key_values,
            use_cache=True,
            output_hidden_states=True
        )
        past_key_values = output.past_key_values
        last_hidden_states = output.hidden_states[-1]    # [B, S, E]
        logit_for_next_step = output.logits[:, -1, :]    # [B, V]
    bsz, seqlen, embed_dim = last_hidden_states.size()
    # NOTE(review): p is never read; the call still advances Python's global RNG state
    p = random.uniform(0, 1)

    next_probs = F.softmax(logit_for_next_step, dim=-1)
    _, top_k_ids = torch.topk(logit_for_next_step, dim=-1, k=beam_width)    # [B, K]
    top_k_probs = torch.gather(next_probs, dim=1, index=top_k_ids)    # [B, K]
    # compute new hidden
    # replicate the cache K times per row so every candidate can be fed as a separate row
    past_key_values = enlarge_past_key_values(past_key_values, beam_width)
    output = model(
        input_ids=top_k_ids.view(-1, 1), 
        attention_mask=torch.ones_like(top_k_ids.view(-1, 1)),
        past_key_values=past_key_values,
        output_hidden_states=True,
        use_cache=True,
    )
    past_key_values = output.past_key_values
    logits = output.logits[:, -1, :]    # [B*K, V]
    next_hidden = output.hidden_states[-1]    # [B*K, 1, E]
    # repeat the context states K times per row to line up with the candidates
    context_hidden = last_hidden_states.unsqueeze(1).expand(-1, beam_width, -1, -1).reshape(bsz*beam_width, seqlen, embed_dim)    # [B*K, S, E]

    selected_idx = ranking_fast(
        context_hidden, 
        next_hidden, 
        top_k_probs,    # [B, K] 
        alpha,
        beam_width,
    )     # [B]
    # prepare for the next step
    # keep only the chosen candidate's token id, hidden state, cache entries and logits
    next_id = top_k_ids[range(len(top_k_ids)), selected_idx].unsqueeze(-1)    # [B, 1]
    next_hidden = torch.stack(torch.split(next_hidden.squeeze(dim=1), beam_width))    # [B, K, E]
    next_hidden = next_hidden[range(bsz), selected_idx, :]    # [B, E]
    last_hidden_states = torch.cat([last_hidden_states, next_hidden.unsqueeze(1)], dim=1)    # [B, S, E]
    past_key_values = select_past_key_values(past_key_values, beam_width, selected_idx)
    logits = torch.stack(torch.split(logits, beam_width))[range(bsz), selected_idx, :]    # [B, V]
    # next_id: [B, 1]
    return next_id, past_key_values, last_hidden_states, logits 

def enlarge_past_key_values(past_key_values, beam_width):
    '''
        Repeat every cached key/value tensor beam_width times along the batch axis,
        i.e. from [B, num_head, seq_len, esz] to [B*K, num_head, seq_len, esz], so that
        row b of the input becomes rows b*K ... b*K+K-1 of the output.
        Returns a list (layers) of lists (tensors of that layer).
    '''
    def _replicate(cache_tensor):
        batch, heads, steps, head_dim = cache_tensor.size()
        tiled = cache_tensor.unsqueeze(1).expand(-1, beam_width, -1, -1, -1)    # [B, K, ...]
        return tiled.reshape(batch * beam_width, heads, steps, head_dim)

    return [[_replicate(cache_tensor) for cache_tensor in layer] for layer in past_key_values]

def select_past_key_values(past_key_values, beam_width, selected_idx):
    '''
        Inverse of the beam enlargement: from every cached tensor of shape
        [B*K, num_head, seq_len, esz] keep, for batch row b, the candidate selected_idx[b].
        selected_idx: [B]
        Returns a list (layers) of lists of tensors shaped [B, num_head, seq_len, esz].
    '''
    def _pick(cache_tensor):
        total_rows, _heads, _steps, _head_dim = cache_tensor.size()
        batch = int(total_rows // beam_width)
        grouped = torch.stack(torch.split(cache_tensor, beam_width, dim=0))    # [B, K, ...]
        return grouped[range(batch), selected_idx, :, :, :]

    return [[_pick(cache_tensor) for cache_tensor in layer] for layer in past_key_values]

# ========== fast plug and play version ========= #
def plug_and_play_fast_ranking(
    context_hidden,
    next_hidden,
    next_top_k_ids,
    next_top_k_probs,
    alpha,
    beta,
    batch_class_score,
    beam_width,
):
    '''
        Choose the best of beam_width candidates using model confidence, similarity to the
        context and an external class score.

        context_hidden: beam_width x context_len x embed_dim
        next_hidden: beam_width x 1 x embed_dim
        next_top_k_ids: beam_width x 1 (not used in the computation)
        batch_class_score: beam_width x 1
        Candidate score:
            (1 - alpha) * probability - alpha * max cosine similarity + beta * class score
        Returns the index of the best candidate for each group of beam_width scores.
    '''
    _, _context_len, _embed_dim = context_hidden.size()
    unit_context = context_hidden / context_hidden.norm(dim=2, keepdim=True)
    unit_next = next_hidden / next_hidden.norm(dim=2, keepdim=True)
    similarity = torch.matmul(unit_context, unit_next.transpose(1, 2)).squeeze(-1)
    degeneration_penalty = similarity.max(dim=-1)[0]
    confidence = next_top_k_probs.view(-1)
    class_score = batch_class_score.view([beam_width])
    combined = (1.0 - alpha) * confidence - alpha * degeneration_penalty + beta * class_score
    grouped = torch.stack(torch.split(combined, beam_width))
    _, selected_idx = grouped.max(dim=-1)
    return selected_idx

def PlugAndPlayContrastiveDecodingOneStepFast(model, input_ids, prefix_len, beam_width, alpha, beta, 
    simctg_tokenizer, image_embeds, clip, clip_text_max_len, past_key_values, last_hidden_states, 
    logit_for_next_step, first_step=False, input_ids_for_class=None):#, add_token_level_score=False):
    '''
        One decoding step in which the top beam_width candidates are re-ranked by
        plug_and_play_fast_ranking using model confidence, similarity to the context and an
        image-text score obtained from `clip`.

        model: the generation model, e.g., gpt2
        input_ids: 1 x seqlen; only fed to the model when first_step is True
        prefix_len: number of leading tokens cut off before the text is scored
        beam_width: number of candidates considered
        alpha, beta: weights forwarded to plug_and_play_fast_ranking
        simctg_tokenizer: object whose decode() turns token ids into text
        image_embeds: passed through to clip.compute_image_text_similarity_via_raw_text
        clip: object providing compute_image_text_similarity_via_raw_text(image_embeds, texts)
        clip_text_max_len: only the last clip_text_max_len continuation tokens are scored
        past_key_values, last_hidden_states, logit_for_next_step: state from the previous
            call; recomputed from input_ids when first_step is True
        input_ids_for_class: token ids generated so far; it is used unconditionally, so the
            default of None does not work, and it is reshaped with the sequence length of
            last_hidden_states, so both must cover the same number of positions

        Returns (next_id, past_key_values, last_hidden_states, logits, input_ids_for_class)
        with the chosen token appended to input_ids_for_class.
        NOTE(review): plug_and_play_fast_ranking views the class score as [beam_width],
        which only fits a batch size of 1.
    '''

    if first_step:
        # encode the whole prefix once to initialise the cache, hidden states and logits
        output = model(input_ids=input_ids, past_key_values=past_key_values, use_cache=True, output_hidden_states=True)
        past_key_values = output.past_key_values
        last_hidden_states = output.hidden_states[-1]    # [B, S, E]
        logit_for_next_step = output.logits[:, -1, :]    # [B, V]
    bsz, seqlen, embed_dim = last_hidden_states.size()
    next_probs = F.softmax(logit_for_next_step, dim = -1)
    _, top_k_ids = torch.topk(logit_for_next_step, dim = -1, k = beam_width)
    top_k_probs = torch.gather(next_probs, dim = 1, index=top_k_ids)

    # compute the new hidden
    # replicate the cache beam_width times so every candidate is fed as a separate row
    past_key_values = enlarge_past_key_values(past_key_values, beam_width)
    output = model(
        input_ids=top_k_ids.view(-1, 1) ,
        attention_mask=torch.ones_like(top_k_ids.view(-1, 1)),
        past_key_values=past_key_values,
        output_hidden_states=True,
        use_cache=True,
    )
    past_key_values = output.past_key_values
    logits = output.logits[:, -1, :]
    next_hidden = output.hidden_states[-1]
    # repeat the context states beam_width times to line up with the candidates
    context_hidden = last_hidden_states.unsqueeze(1).expand(-1, beam_width, -1, -1).reshape(bsz*beam_width, seqlen, embed_dim)
    
    # prepare for the classification model
    # one row per candidate: the ids so far followed by that candidate's token id
    input_ids_for_class_ = torch.cat([
        input_ids_for_class.unsqueeze(1).expand(-1, beam_width, -1).reshape(bsz*beam_width, seqlen),
        top_k_ids.view(-1, 1)
        ], dim=-1
    )

    batch_text_list = []
    for one_input_id in input_ids_for_class_:
        one_text = simctg_tokenizer.decode(one_input_id[prefix_len:][-clip_text_max_len:]) 
        # we only consider the class score of the generated text continuation
        batch_text_list.append(one_text)
    batch_score = clip.compute_image_text_similarity_via_raw_text(image_embeds, batch_text_list)

    selected_idx = plug_and_play_fast_ranking(
        context_hidden, 
        next_hidden, 
        top_k_ids, 
        top_k_probs, 
        alpha, 
        beta, 
        batch_score,
        beam_width,
    )       

    # prepare for the next step
    # keep only the chosen candidate's token id, hidden state, cache entries and logits
    next_id = top_k_ids[range(len(top_k_ids)), selected_idx].unsqueeze(-1)
    next_hidden = torch.stack(torch.split(next_hidden.squeeze(dim=1), beam_width))
    next_hidden = next_hidden[range(bsz), selected_idx, :]
    last_hidden_states = torch.cat([last_hidden_states, next_hidden.unsqueeze(1)], dim=1)
    past_key_values = select_past_key_values(past_key_values, beam_width, selected_idx)
    logits = torch.stack(torch.split(logits, beam_width))[range(bsz), selected_idx, :]
    input_ids_for_class = torch.cat([input_ids_for_class, next_id], dim=-1)
    return next_id, past_key_values, last_hidden_states, logits, input_ids_for_class

In [4]:
#trainer.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.multiprocessing as mp
import argparse, os
import random
import numpy as np
import time
import logging
import progressbar

import logging
# Disable the logger named 'transformers.generation_utils' so its messages are not printed.
# NOTE(review): this only has an effect if the installed transformers version logs under
# exactly this logger name -- confirm against the installed version.
logging.getLogger('transformers.generation_utils').disabled = True

def eval_model(args, model, data, cuda_available, device):
    '''
        Compute the average per-token loss of `model` over the 'test' split of `data`.

        args: needs batch_size_per_gpu and number_of_gpu (their product is the batch size)
        model: must provide eval(), train() and eval_loss(inputs, labels) returning
            (loss tensor, token-count tensor)
        data: must provide test_num and get_next_validation_batch(batch_size=..., mode=...)
        cuda_available / device: when cuda_available is truthy the batch is moved to `device`

        The model is switched to eval mode for the pass and back to train mode afterwards.
        Returns summed loss divided by summed token count (a float).
        NOTE(review): batches are requested with mode='test' although the caller reports the
        result as validation loss -- confirm which split is intended.
    '''
    dataset_batch_size = args.batch_size_per_gpu * args.number_of_gpu
    # Ceiling division. The previous `int(n / bs) + 1` ran one batch too many whenever the
    # split size is a multiple of the batch size: the extra batch wrapped around to the start
    # of the split (counting it twice) and left the data cursor in the middle of the split
    # for the next evaluation. At least one batch is always requested, as before.
    eval_step = max(1, -(-data.test_num // dataset_batch_size))
    val_loss, token_sum = 0., 0.
    model.eval()
    with torch.no_grad():
        p = progressbar.ProgressBar(eval_step)
        p.start()
        for idx in range(eval_step):
            p.update(idx)
            batch_input_tensor, batch_labels, _ = \
            data.get_next_validation_batch(batch_size=dataset_batch_size, mode='test')
            if cuda_available:
                batch_input_tensor = batch_input_tensor.cuda(device)
                batch_labels = batch_labels.cuda(device)
            one_val_loss, one_val_token_sum = model.eval_loss(batch_input_tensor, batch_labels)
            # sum over whatever dimensions the model returns (e.g. one entry per GPU)
            val_loss += torch.sum(one_val_loss).item()
            token_sum += torch.sum(one_val_token_sum).item()
        p.finish()
    model.train()
    val_loss = val_loss / token_sum
    return val_loss

def model_training(args, data, model, total_steps, print_every, save_every, ckpt_save_path, cuda_available, device):
    '''
        Train `model` for `total_steps` optimizer updates with gradient accumulation.

        args: needs batch_size_per_gpu, gradient_accumulation_steps, number_of_gpu,
            effective_batch_size, learning_rate, margin and max_grad_norm
        data: must provide get_next_train_batch(batch_size) (and whatever eval_model needs)
        model: called as model(inputs, labels, margin) and expected to return
            (mle_loss, cl_loss); must also provide save_model(path)
            (via model.module when several GPUs are used)
        total_steps: number of optimizer updates; the first 10% are linear warmup
        print_every / save_every: logging and evaluation/checkpoint period, in updates
        ckpt_save_path: directory receiving the checkpoints (created when missing)
        cuda_available / device: when cuda_available is truthy batches are moved to `device`

        Every save_every updates the model is evaluated with eval_model; a checkpoint is
        written only when that loss improves on the best one so far, and older
        'training_step*' checkpoints beyond the newest max_save_num are deleted.
        Returns the trained model.
    '''
    import os
    import shutil
    if not os.path.exists(ckpt_save_path): # recursively construct directory
        os.makedirs(ckpt_save_path, exist_ok=True)

    max_save_num = 1  # number of checkpoints kept on disk

    batch_size_per_gpu, gradient_accumulation_steps, number_of_gpu, effective_batch_size = \
    args.batch_size_per_gpu, args.gradient_accumulation_steps, args.number_of_gpu, args.effective_batch_size
    assert effective_batch_size == batch_size_per_gpu * gradient_accumulation_steps * number_of_gpu

    warmup_steps = int(0.1 * total_steps) # 10% of training steps are used for warmup
    print ('total training steps is {}, warmup steps is {}'.format(total_steps, warmup_steps))
    from transformers.optimization import get_linear_schedule_with_warmup
    try:
        from transformers.optimization import AdamW
    except ImportError:
        AdamW = None
    if AdamW is not None:
        optimizer = AdamW(model.parameters(), lr=args.learning_rate)
    else:
        # transformers no longer ships its own AdamW; use torch's implementation with the
        # eps / weight_decay defaults the transformers version had
        optimizer = torch.optim.AdamW(model.parameters(), lr=args.learning_rate, eps=1e-6, weight_decay=0.0)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
    optimizer.zero_grad()

    effective_batch_acm = 0
    # Counts micro-batches. It has to start at 0: starting at 1 made the very first
    # optimizer update happen after only gradient_accumulation_steps - 1 micro-batches.
    all_batch_step = 0
    print_valid, save_valid = False, False
    train_loss, train_cl_loss, min_val_loss = 0., 0., 1e10

    print ('--------------------------------------------------------------------------')
    print ('Start Training:')
    model.train()
    number_of_saves = 0

    while effective_batch_acm < total_steps:
        all_batch_step += 1
        train_batch_input_tensor, train_batch_labels, _ = data.get_next_train_batch(batch_size_per_gpu * number_of_gpu)
        if cuda_available:
            train_batch_input_tensor = train_batch_input_tensor.cuda(device)
            train_batch_labels = train_batch_labels.cuda(device)
        mle_loss, cl_loss = model(train_batch_input_tensor, train_batch_labels, args.margin)

        loss = (mle_loss + cl_loss).mean()
        loss.backward()
        # .mean() is a no-op for scalar losses and reduces per-GPU loss vectors, on which a
        # bare .item() would raise
        train_loss += mle_loss.mean().item()
        train_cl_loss += cl_loss.mean().item()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        # parameter update
        if all_batch_step % gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            effective_batch_acm += 1
            print_valid, save_valid = True, True

        # print intermediate result
        if effective_batch_acm % print_every == 0 and print_valid:
            # the running sums are reset at every save, so average over the micro-batches
            # seen since the last save only
            denominator = (effective_batch_acm - (number_of_saves * save_every)) * gradient_accumulation_steps
            one_train_loss = train_loss / denominator
            one_train_cl_loss = train_cl_loss / denominator
            print ('At training steps {}, training MLE loss is {}, train CL loss is {}'.format(effective_batch_acm, 
                one_train_loss, one_train_cl_loss))
            print_valid = False

        # saving result
        if effective_batch_acm % save_every == 0 and save_valid:
            number_of_saves += 1

            save_valid = False
            one_train_loss = train_loss / (save_every * gradient_accumulation_steps)
            one_train_cl_loss = train_cl_loss / (save_every * gradient_accumulation_steps)

            model.eval()
            one_val_loss = eval_model(args, model, data, cuda_available, device)
            model.train()

            print ('At training steps {}, training MLE loss is {}, train CL loss is {}, validation loss is {}'.format(effective_batch_acm, 
                one_train_loss, one_train_cl_loss, one_val_loss))

            train_loss, train_cl_loss = 0., 0.

            if one_val_loss < min_val_loss:
                # a checkpoint is written only when the evaluation loss improves
                min_val_loss = one_val_loss
                print ('Saving model...')
                one_val_ppl = round(np.exp(one_val_loss), 3)
                save_name = 'training_step_{}_train_mle_loss_{}_train_cl_loss_{}_dev_loss_{}_dev_ppl_{}'.format(effective_batch_acm,
                round(one_train_loss,5), round(one_train_cl_loss,5), round(one_val_loss,5), one_val_ppl)

                model_save_path = os.path.join(ckpt_save_path, save_name)
                if not os.path.exists(model_save_path): # recursively construct directory
                    os.makedirs(model_save_path, exist_ok=True)
                if cuda_available and torch.cuda.device_count() > 1:
                    model.module.save_model(model_save_path)
                else:
                    model.save_model(model_save_path)
                print ('Model Saved!')

                # --------------------------------------------------------------------------------------------- #
                # removing extra checkpoints: keep only the max_save_num most recently modified ones
                checkpoint_names = [fname for fname in os.listdir(ckpt_save_path) if fname.startswith('training_step')]
                checkpoint_names.sort(key=lambda fname: os.stat(os.path.join(ckpt_save_path, fname)).st_mtime)
                number_to_delete = max(0, len(checkpoint_names) - max_save_num)
                for stale_name in checkpoint_names[:number_to_delete]:
                    stale_path = os.path.join(ckpt_save_path, stale_name)
                    # delete through the standard library instead of a shell `rm -r` string
                    if os.path.isdir(stale_path):
                        shutil.rmtree(stale_path)
                    else:
                        os.remove(stale_path)
                print ('-----------------------------------')
                # --------------------------------------------------------------------------------------------- #
    return model

In [5]:
#loss_func.py
import torch

def compute_valid_token_num(valid_len_list):
    '''Sum of n * (n - 1) over all lengths n, i.e. the number of ordered off-diagonal pairs.'''
    return sum(one_len * (one_len - 1) for one_len in valid_len_list)

def build_mask_matrix(seqlen, valid_len_list, prefix_len = 0):
    '''
        Build one seqlen x seqlen float mask per sequence marking the token pairs that
        take part in the contrastive loss; result shape is [bsz, seqlen, seqlen].

        valid_len_list: number of non-padding tokens of every sequence
        prefix_len: pairs lying entirely inside the first prefix_len tokens are excluded

        A pair (i, j) is 1. when i != j and both positions are below the valid length:
          valid length 4 of 4          valid length 3 of 4
             [0., 1., 1., 1.]             [0., 1., 1., 0.]
             [1., 0., 1., 1.]             [1., 0., 1., 0.]
             [1., 1., 0., 1.]             [1., 1., 0., 0.]
             [1., 1., 1., 0.]             [0., 0., 0., 0.]
    '''
    off_diagonal = (torch.ones(seqlen, seqlen) - torch.eye(seqlen, seqlen)).type(torch.FloatTensor)
    num_sequences = len(valid_len_list)
    per_sequence = []
    for valid_len in valid_len_list:
        mask = off_diagonal.clone()
        # rows and columns that belong to padding positions are switched off
        mask[valid_len:, :] = 0.
        mask[:, valid_len:] = 0.
        if prefix_len > 0:
            mask[:prefix_len, :prefix_len] = 0.
        per_sequence.append(mask)
    stacked = torch.stack(per_sequence, dim=0)
    assert stacked.size() == torch.Size([num_sequences, seqlen, seqlen])
    return stacked
        
def contrastive_loss(margin, score_matrix, input_ids, pad_token_id, prefix_len=0):
    '''
       Margin-based token-level contrastive loss.

       margin: predefined margin to push similarity score away
       score_matrix: bsz x seqlen x seqlen similarity scores
       input_ids: bsz x seqlen
       pad_token_id: indicating which tokens are padding token
       prefix_len: forwarded to build_mask_matrix (pairs inside the prefix are ignored)

       For every pair (i, j) the term relu(margin - (score[i, i] - score[i, j])) is computed;
       terms are kept only where build_mask_matrix marks the pair as valid and row i is not
       padding, and their sum is divided by the number of valid pairs.
       Returns a scalar tensor; 0 when the batch contains no valid pair at all.
    '''
    bsz, seqlen, _ = score_matrix.size()
    gold_score = torch.diagonal(score_matrix, offset=0, dim1=1, dim2=2).unsqueeze(-1) # bsz x seqlen x 1
    assert gold_score.size() == torch.Size([bsz, seqlen, 1])
    difference_matrix = gold_score - score_matrix
    assert difference_matrix.size() == torch.Size([bsz, seqlen, seqlen])
    loss_matrix = torch.nn.functional.relu(margin - difference_matrix) # bsz x seqlen x seqlen

    ### input mask: 1. for real tokens, 0. for padding, on the same device as the loss
    input_mask = input_ids.ne(pad_token_id).to(device=loss_matrix.device, dtype=torch.float)

    valid_len_list = [int(item) for item in torch.sum(input_mask, dim = -1).tolist()]
    loss_mask = build_mask_matrix(seqlen, valid_len_list, prefix_len).to(loss_matrix.device)
    masked_loss_matrix = loss_matrix * loss_mask

    loss_matrix = torch.sum(masked_loss_matrix, dim = -1)
    assert loss_matrix.size() == input_ids.size()
    loss_matrix = loss_matrix * input_mask
    # The mask entries are 0/1, so the pair count is either 0 or at least 1. Clamping keeps
    # every non-degenerate result unchanged and turns the 0/0 = NaN of a batch without any
    # valid pair (e.g. all sequences have a single real token) into a loss of 0.
    valid_pair_num = torch.sum(loss_mask).clamp(min=1.0)
    cl_loss = torch.sum(loss_matrix) / valid_pair_num
    return cl_loss

In [6]:
#simCTG.py
import os
import sys
import operator
from tqdm import tqdm
from operator import itemgetter
import torch
from torch import nn
import random
import argparse
import numpy as np
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
#from loss_func import contrastive_loss

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import datetime

# Cross entropy with the default reduction; SimCTG.forward uses it for the training MLE loss.
train_fct = CrossEntropyLoss()
# Unreduced (per-element) cross entropy; SimCTG.eval_loss sums its output itself.
val_fct = CrossEntropyLoss(reduction='none')
class SimCTG(nn.Module):
    '''
       Wrapper around a GPT2LMHeadModel and its tokenizer that exposes

         * forward / eval_loss: the MLE loss plus a token-level contrastive loss
         * save_model: checkpointing of model and tokenizer
         * several decoding routines (magic search, contrastive search,
           top-k sampling, nucleus sampling)
    '''
    def __init__(self, model_name, sos_token, pad_token):
        '''
           model_name: name or path handed to from_pretrained for tokenizer and model
           sos_token: start-of-text token; added to the tokenizer if missing
           pad_token: padding token; added to the tokenizer if missing
        '''
        super(SimCTG, self).__init__()
        from transformers import AutoTokenizer, GPT2LMHeadModel
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.sos_token, self.sos_token_id = self.add_special_token(sos_token)
        print ('sos token is {}, sos token id is {}'.format(self.sos_token, self.sos_token_id))
        self.pad_token, self.pad_token_id = self.add_special_token(pad_token)
        print ('pad token is {}, pad token id is {}'.format(self.pad_token, self.pad_token_id))
        # The tokenizer's bos token doubles as the eos token here.
        self.eos_token, self.eos_token_id = self.tokenizer.bos_token, self.tokenizer.bos_token_id
        print ('eos token is {}, eos token id is {}'.format(self.eos_token, self.eos_token_id))
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.vocab_size = len(self.tokenizer)
        print ('Resizing model embedding...')
        # The tokenizer may have grown by the tokens added above.
        self.model.resize_token_embeddings(len(self.tokenizer))
        print ('Model embedding resized!')
        self.embed_dim = self.model.config.hidden_size

    def add_special_token(self, special_token):
        '''
           Make sure special_token is known to the tokenizer (adding it when it
           is missing) and return the pair (special_token, special_token_id).
        '''
        if special_token in self.tokenizer.vocab:
            print (special_token + ' token exists.')
        else:
            print ('Add token to the tokenizer.')
            print ('Original vocabulary size is {}'.format(len(self.tokenizer)))
            self.tokenizer.add_tokens([special_token])
            print ('Vocabulary size after extension is {}'.format(len(self.tokenizer)))
            assert len(self.tokenizer.convert_tokens_to_ids([special_token])) == 1
        special_token_id = self.tokenizer.convert_tokens_to_ids([special_token])[0]
        return special_token, special_token_id

    def compute_logits_and_hidden_states(self, input_ids):
        '''
           used for advanced decoding
           input_ids: 1 x seqlen
           Returns (last_hidden_states, logits) of the wrapped model.
        '''
        outputs = self.model(input_ids=input_ids, output_hidden_states=True)
        last_hidden_states = outputs.hidden_states[-1]
        logits = outputs.logits
        return last_hidden_states, logits

    def forward(self, input_ids, labels, margin):
        '''
           input_ids: bsz x seqlen
           labels: bsz x seqlen targets for the cross-entropy loss
           margin: margin handed to contrastive_loss
           Returns (mle_loss, cl_loss).
        '''
        bsz, seqlen = input_ids.size()
        outputs = self.model(input_ids=input_ids, output_hidden_states=True)
        logits = outputs.logits
        assert logits.size() == torch.Size([bsz, seqlen, self.vocab_size])
        last_hidden_states = outputs.hidden_states[-1]
        assert last_hidden_states.size() == torch.Size([bsz, seqlen, self.embed_dim])
        mle_loss = train_fct(logits.view(-1, self.vocab_size), labels.view(-1))

        # Cosine similarity between every pair of token representations.
        norm_rep = last_hidden_states / last_hidden_states.norm(dim=2, keepdim=True)
        cosine_scores = torch.matmul(norm_rep, norm_rep.transpose(1,2))
        assert cosine_scores.size() == torch.Size([bsz, seqlen, seqlen])
        cl_loss = contrastive_loss(margin, cosine_scores, input_ids, self.pad_token_id, prefix_len=0)
        return mle_loss, cl_loss

    def eval_loss(self, input_ids, labels):
        '''
           input_ids: bsz x seqlen
           labels: bsz x seqlen; positions equal to -100 are not counted
           Returns (mle_loss_sum, token_num_sum): the summed per-token loss and
           the number of label positions different from -100.
        '''
        bsz, seqlen = input_ids.size()
        # Hidden states are not needed for evaluation, so they are not requested.
        outputs = self.model(input_ids=input_ids)
        logits = outputs.logits
        assert logits.size() == torch.Size([bsz, seqlen, self.vocab_size])
        mle_loss = val_fct(logits.view(-1, self.vocab_size), labels.view(-1))
        assert mle_loss.size() == torch.Size([bsz * seqlen])
        # sum
        mle_loss_sum = torch.sum(mle_loss)
        # Count the labels that are real targets (anything but -100).
        token_num_sum = torch.sum(labels.ne(-100))
        return mle_loss_sum, token_num_sum

    def save_model(self, ckpt_save_path):
        '''
           Write model and tokenizer to ckpt_save_path, creating the directory
           (recursively) when it does not exist yet.
        '''
        os.makedirs(ckpt_save_path, exist_ok=True)
        # save model
        self.model.save_pretrained(ckpt_save_path)
        # save tokenizer
        self.tokenizer.save_pretrained(ckpt_save_path)

    def parse_sentences(self, text, num_of_sentences_to_keep):
        '''
           Keep the first num_of_sentences_to_keep '.'-separated pieces of text.
           When pieces were dropped the result ends with a '.'; otherwise
           surrounding dots and whitespace are stripped.
        '''
        item_list = text.split('.')
        res_list = item_list[:num_of_sentences_to_keep]
        if len(item_list) > num_of_sentences_to_keep:
            res_text = '.'.join(res_list).strip('.') + '.'
        else:
            res_text = '.'.join(res_list).strip('.').strip()
        return res_text

    def parse_generated_result(self, output, num_of_sentences_to_keep):
        '''
           Decode output and split it at the eos token.

           Returns (full_text, generated_text): full_text is built from the
           first two eos-separated segments, generated_text from the second
           segment only. Both are truncated by parse_sentences. When the decoded
           text contains no eos token there is no second segment and
           generated_text is derived from the empty string instead of raising
           IndexError.
        '''
        output_text = self.tokenizer.decode(output)
        item_list = output_text.split(self.eos_token)
        full_text = self.eos_token.join(item_list[:2]).strip()
        full_text = self.parse_sentences(full_text, num_of_sentences_to_keep)
        generated_text = item_list[1].strip() if len(item_list) > 1 else ''
        generated_text = self.parse_sentences(generated_text, num_of_sentences_to_keep)
        return full_text, generated_text

    # decoding functions
    # ------------------------------------------------------- #

    def parse_output_token_list(self, output):
        '''
           output: 1-D tensor of token ids
           Drops sos tokens, stops at the first eos token, decodes the rest and
           collapses runs of whitespace into single spaces.
        '''
        output = output.tolist()
        res_list = []
        for token_id in output:
            if token_id == self.sos_token_id:
                continue
            elif token_id == self.eos_token_id:
                break
            else:
                res_list.append(token_id)
        text = self.tokenizer.decode(res_list).strip()
        return ' '.join(text.split()).strip()

    @torch.no_grad()
    def magic_search(self, input_ids, beam_width, alpha, decoding_len, beta, image_instance, clip, 
        clip_text_max_len):#, add_token_level_score=False):
        '''
           input_ids: prefix input; the prefix length is read from dimension 1
           decoding_len: total target length; decoding_len - prefix_len decoding
                         steps are executed
           beam_width, alpha, beta, clip_text_max_len: forwarded unchanged to
                         PlugAndPlayContrastiveDecodingOneStepFast
           image_instance: image handed to
                         clip.compute_image_representation_from_image_instance
           clip: object providing the image representation; also forwarded to
                         the one-step decoding helper
           Returns the parsed text of the first sequence.
        '''
        prefix_len = input_ids.size()[1]
        # NOTE(review): needs a module named `utlis` to be importable at call time.
        from utlis import PlugAndPlayContrastiveDecodingOneStepFast
        past_key_values, last_hidden_states, logits = None, None, None
        input_ids_for_class = input_ids.clone()

        image_embeds = clip.compute_image_representation_from_image_instance(image_instance)

        # the maximum supported length of generation for SimCTG is 256
        # to support longer generated length, you can re-train the SimCTG model with longer sequences
        decoding_len = decoding_len - prefix_len
        for step in range(decoding_len):
            input_ids, past_key_values, last_hidden_states, logits, input_ids_for_class = \
            PlugAndPlayContrastiveDecodingOneStepFast(
                self.model, 
                input_ids, 
                prefix_len,
                beam_width, 
                alpha, 
                beta, 
                self.tokenizer,
                image_embeds, 
                clip, 
                clip_text_max_len,
                past_key_values,
                last_hidden_states,
                logits,
                first_step=step==0,
                input_ids_for_class=input_ids_for_class,
            )
        return self.parse_output_token_list(input_ids_for_class[0])

    @torch.no_grad()
    def fast_contrastive_search(self, input_ids, beam_width, alpha, decoding_len):
        '''
           input_ids: prefix input; 1 x prefix_len
           decoding_len: total target length including the prefix;
                         decoding_len - prefix_len tokens are generated
           beam_width: size of candidate pool during decoding
           alpha: regulates importance of model confidence and degeneration penalty

           Puts the wrapped model into eval mode and runs without gradient
           tracking, like the other decoding routines of this class.
           Returns the parsed text of the first sequence.
        '''
        self.model.eval()
        # NOTE(review): needs a module named `utlis` to be importable at call time.
        from utlis import ContrastiveDecodingOneStepFast
        # sanity check
        assert alpha >= 0. and alpha <= 1.0

        # fast mode
        prefix_len = input_ids.size()[1]
        # One running token list per batch row, seeded with the prefix.
        generated = [item for item in input_ids.tolist()]
        past_key_values = None
        last_hidden_states = None
        logits = None
        decoding_len = decoding_len - prefix_len
        for step in range(decoding_len):
            input_ids, past_key_values, last_hidden_states, logits = ContrastiveDecodingOneStepFast(
                self.model,
                input_ids,
                beam_width,
                alpha,
                past_key_values,
                last_hidden_states,
                self.tokenizer,
                logits,
                first_step=step == 0,
            )
            tokens = input_ids.squeeze(dim=-1).tolist()
            for idx, t in enumerate(tokens):
                generated[idx].append(t)
        return self.parse_output_token_list(torch.LongTensor(generated[0]))

    def top_k_sampling(self, input_ids, k, decoding_len):
        '''
           Sample with self.model.generate using top_k=k and top_p=1.0.
           decoding_len is passed as max_length.
           Returns the parsed text of the first returned sequence.
        '''
        output = self.model.generate(
                            input_ids, 
                            do_sample=True, 
                            max_length=decoding_len, 
                            top_p=1.0,
                            top_k=k)
        return self.parse_output_token_list(output[0])

    def nucleus_sampling(self, input_ids, nucleus_p, decoding_len):
        '''
           Sample with self.model.generate using top_p=nucleus_p and top_k=0.
           decoding_len is passed as max_length.
           Returns the parsed text of the first returned sequence.
        '''
        output = self.model.generate(
                            input_ids, 
                            do_sample=True, 
                            max_length=decoding_len, 
                            top_p=nucleus_p,
                            top_k=0)
        return self.parse_output_token_list(output[0])


In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.multiprocessing as mp
import argparse, os
import random
import numpy as np
import time
import logging
import progressbar

import logging
logging.getLogger('transformers.generation_utils').disabled = True
import argparse
if __name__ == '__main__':
    # Training driver: detect hardware, load the data, build the model and
    # hand everything to model_training.

    # Query CUDA once and reuse the answer for every later decision.
    cuda_available = torch.cuda.is_available()
    multi_gpu_training = False
    if cuda_available:
        print ('Cuda is available.')
        if torch.cuda.device_count() > 1:
            multi_gpu_training = True
            print ('Using Multi-GPU training, number of GPU is {}'.format(torch.cuda.device_count()))
        else:
            print ('Using single GPU training.')

    # Fall back to the CPU when CUDA is absent, so `device` never names
    # hardware that does not exist.
    device = torch.device('cuda' if cuda_available else 'cpu')
    model_name = args.model_name

    sos_token, pad_token = r'<-start_of_text->', r'<-pad->'
    # The flag arrives as the string 'True' or 'False'; anything else is rejected.
    add_eos_token_to_data = args.add_eos_token_to_data
    if add_eos_token_to_data == 'True':
        add_eos_token_to_data = True
        print ('Add eos token to data!')
    elif add_eos_token_to_data == 'False':
        add_eos_token_to_data = False
        print ('Do not add eos token to data!')
    else:
        raise Exception('Wrong eos configuration for data!!!')
    print ('Loading data...')
    #from dataclass import Data
    data = Data(model_name, args.train_path, args.dev_path, args.test_path, args.max_len, 
        sos_token, pad_token, add_eos_token_to_data)
    print ('Data loaded.')

    #from trainer import model_training
    print ('############################################################')
    print ('Start Training...')
    #from simctg import SimCTG
    print ('Initializaing SimCTG model...')
    model = SimCTG(model_name, sos_token, pad_token)
    if multi_gpu_training:
        model = nn.DataParallel(model) # multi-gpu training
    # On a CPU-only machine this leaves the model where it already is.
    model = model.to(device)
    print ('Model loaded') 
    total_steps, print_every, save_every = args.total_steps, args.print_every, args.save_every
    ckpt_save_path = args.save_path_prefix
    model = model_training(args, data, model, total_steps, print_every, save_every, 
        ckpt_save_path, cuda_available, device)
    print ('Training stage completed!')
    print ('############################################################')

Cuda is available.
Using single GPU training.
Add eos token to data!
Loading data...


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Add token to the tokenizer.
Original vocabulary size is 50257
Vocabulary size after extension is 50258
sos token is <-start_of_text->, sos token id is 50257
Add token to the tokenizer.
Original vocabulary size is 50258
Vocabulary size after extension is 50259
pad token is <-pad->, pad token id is 50258
eos token is <|endoftext|>, eos token id is 50256
Processing /kaggle/input/flickr30k/flickr30k/flickr30k/flickr30k_train.json


| |       #                                      | 144999 Elapsed Time: 0:00:17
- | #                                               | 882 Elapsed Time: 0:00:00

/kaggle/input/flickr30k/flickr30k/flickr30k/flickr30k_train.json processed!
Processing /kaggle/input/flickr30k/flickr30k/flickr30k/flickr30k_val.json


| |     #                                          | 5069 Elapsed Time: 0:00:00
- | #                                               | 878 Elapsed Time: 0:00:00

/kaggle/input/flickr30k/flickr30k/flickr30k/flickr30k_val.json processed!
Processing /kaggle/input/flickr30k/flickr30k/flickr30k/flickr30k_test.json


| |     #                                          | 4999 Elapsed Time: 0:00:00


/kaggle/input/flickr30k/flickr30k/flickr30k/flickr30k_test.json processed!
train number:145000, dev number:5070, test number:5000
Data loaded.
############################################################
Start Training...
Initializaing SimCTG model...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before t

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Resizing model embedding...
Model embedding resized!
Model loaded
total training steps is 10000, warmup steps is 1000
--------------------------------------------------------------------------
Start Training:




At training steps 50, training MLE loss is 83.92462783813477, train CL loss is 0.44443097040057183
At training steps 100, training MLE loss is 72.84322461128235, train CL loss is 0.4362902021408081
At training steps 150, training MLE loss is 53.687166572411854, train CL loss is 0.34938953834275405
At training steps 200, training MLE loss is 41.61146593719721, train CL loss is 0.3153571259789169


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 250, training MLE loss is 34.08618326663971, train CL loss is 0.29354951579868793


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 250, training MLE loss is 34.08618326663971, train CL loss is 0.29354951579868793, validation loss is 3.5151402564134524
Saving model...
Model Saved!
-----------------------------------
At training steps 300, training MLE loss is 3.592324674129486, train CL loss is 0.18780366599559783
At training steps 350, training MLE loss is 3.513131790161133, train CL loss is 0.17365160498768092
At training steps 400, training MLE loss is 3.450180763800939, train CL loss is 0.1557591376826167
At training steps 450, training MLE loss is 3.408612800836563, train CL loss is 0.1397604699851945


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 500, training MLE loss is 3.371379598617554, train CL loss is 0.12458348222076893


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 500, training MLE loss is 3.371379598617554, train CL loss is 0.12458348222076893, validation loss is 3.054257986131999
Saving model...
Model Saved!
-----------------------------------
At training steps 550, training MLE loss is 3.184923678636551, train CL loss is 0.04803856929764152
At training steps 600, training MLE loss is 3.16038372695446, train CL loss is 0.0438922195835039
At training steps 650, training MLE loss is 3.139466874996821, train CL loss is 0.041009602152432004
At training steps 700, training MLE loss is 3.123450574874878, train CL loss is 0.03889434695942327


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 750, training MLE loss is 3.113983025312424, train CL loss is 0.03705008470267057


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 750, training MLE loss is 3.113983025312424, train CL loss is 0.03705008470267057, validation loss is 2.9177996358453493
Saving model...
Model Saved!
-----------------------------------
At training steps 800, training MLE loss is 3.042651650905609, train CL loss is 0.027111453879624606
At training steps 850, training MLE loss is 3.035914190411568, train CL loss is 0.026514992411248387
At training steps 900, training MLE loss is 3.031106897989909, train CL loss is 0.0258994620355467
At training steps 950, training MLE loss is 3.02278921186924, train CL loss is 0.02517482613446191


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 1000, training MLE loss is 3.0111970188617705, train CL loss is 0.02448700226470828


| |                  #                              | 156 Elapsed Time: 0:00:07


At training steps 1000, training MLE loss is 3.0111970188617705, train CL loss is 0.02448700226470828, validation loss is 2.857609181242913
Saving model...
Model Saved!
-----------------------------------
At training steps 1050, training MLE loss is 2.9674647784233095, train CL loss is 0.020417435094714165
At training steps 1100, training MLE loss is 2.9685023760795595, train CL loss is 0.020571842447388917
At training steps 1150, training MLE loss is 2.95710551738739, train CL loss is 0.02023087680339813
At training steps 1200, training MLE loss is 2.952403658926487, train CL loss is 0.019910916064400225


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 1250, training MLE loss is 2.9460953993797303, train CL loss is 0.019506474005058408


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 1250, training MLE loss is 2.9460953993797303, train CL loss is 0.019506474005058408, validation loss is 2.8262295286135233
Saving model...
Model Saved!
-----------------------------------
At training steps 1300, training MLE loss is 2.900182839632034, train CL loss is 0.017074375608935953
At training steps 1350, training MLE loss is 2.9033000218868255, train CL loss is 0.017205944922752677
At training steps 1400, training MLE loss is 2.901982849438985, train CL loss is 0.01702415685945501
At training steps 1450, training MLE loss is 2.900348283946514, train CL loss is 0.016782414014451207


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 1500, training MLE loss is 2.892314726829529, train CL loss is 0.016625919630751013


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 1500, training MLE loss is 2.892314726829529, train CL loss is 0.016625919630751013, validation loss is 2.8019682970374427
Saving model...
Model Saved!
-----------------------------------
At training steps 1550, training MLE loss is 2.8704008090496065, train CL loss is 0.01529899430461228
At training steps 1600, training MLE loss is 2.8702472406625748, train CL loss is 0.01513201936846599
At training steps 1650, training MLE loss is 2.8681112058957416, train CL loss is 0.015091005633585154
At training steps 1700, training MLE loss is 2.8651502364873886, train CL loss is 0.014923128851223736


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 1750, training MLE loss is 2.8614171261787416, train CL loss is 0.014708523183129728


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 1750, training MLE loss is 2.8614171261787416, train CL loss is 0.014708523183129728, validation loss is 2.7791054178771764
Saving model...
Model Saved!
-----------------------------------
At training steps 1800, training MLE loss is 2.8388427472114564, train CL loss is 0.013510034438222646
At training steps 1850, training MLE loss is 2.8390036702156065, train CL loss is 0.013490996335167437
At training steps 1900, training MLE loss is 2.8416985273361206, train CL loss is 0.01341573273918281
At training steps 1950, training MLE loss is 2.8401260021328927, train CL loss is 0.013257704223506153


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 2000, training MLE loss is 2.8384567608833313, train CL loss is 0.013172543717548252


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 2000, training MLE loss is 2.8384567608833313, train CL loss is 0.013172543717548252, validation loss is 2.766005718898537
Saving model...
Model Saved!
-----------------------------------
At training steps 2050, training MLE loss is 2.827410578727722, train CL loss is 0.012883469117805363
At training steps 2100, training MLE loss is 2.8272238582372666, train CL loss is 0.012678578358609228
At training steps 2150, training MLE loss is 2.827666489283244, train CL loss is 0.01256366740136097
At training steps 2200, training MLE loss is 2.82239592730999, train CL loss is 0.012377224022056908


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 2250, training MLE loss is 2.8192341213226317, train CL loss is 0.012233124196529388


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 2250, training MLE loss is 2.8192341213226317, train CL loss is 0.012233124196529388, validation loss is 2.7555367635917354
Saving model...
Model Saved!
-----------------------------------
At training steps 2300, training MLE loss is 2.81224271774292, train CL loss is 0.011769840074703098
At training steps 2350, training MLE loss is 2.8072637128829956, train CL loss is 0.011619221034925431
At training steps 2400, training MLE loss is 2.802893200715383, train CL loss is 0.011492114181940754
At training steps 2450, training MLE loss is 2.8034437239170074, train CL loss is 0.011441936650080607


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 2500, training MLE loss is 2.800370178222656, train CL loss is 0.011356949927285314


| |                  #                              | 156 Elapsed Time: 0:00:07


At training steps 2500, training MLE loss is 2.800370178222656, train CL loss is 0.011356949927285314, validation loss is 2.747666124716369
Saving model...
Model Saved!
-----------------------------------
At training steps 2550, training MLE loss is 2.795918744802475, train CL loss is 0.01083222493994981
At training steps 2600, training MLE loss is 2.7981826668977736, train CL loss is 0.010667047089664266
At training steps 2650, training MLE loss is 2.792562716801961, train CL loss is 0.01065063317772001
At training steps 2700, training MLE loss is 2.791425256729126, train CL loss is 0.010574893637094646


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 2750, training MLE loss is 2.793423658847809, train CL loss is 0.010532991450745613


| |                  #                              | 156 Elapsed Time: 0:00:07


At training steps 2750, training MLE loss is 2.793423658847809, train CL loss is 0.010532991450745613, validation loss is 2.737674119031025
Saving model...
Model Saved!
-----------------------------------
At training steps 2800, training MLE loss is 2.787177857160568, train CL loss is 0.010515369405038655
At training steps 2850, training MLE loss is 2.78814116358757, train CL loss is 0.010501355009619146
At training steps 2900, training MLE loss is 2.785188220739365, train CL loss is 0.010438585987625022
At training steps 2950, training MLE loss is 2.7830337527394295, train CL loss is 0.010362968878471292


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 3000, training MLE loss is 2.781862416982651, train CL loss is 0.010309814175590872


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 3000, training MLE loss is 2.781862416982651, train CL loss is 0.010309814175590872, validation loss is 2.7335590921646116
Saving model...
Model Saved!
-----------------------------------
At training steps 3050, training MLE loss is 2.76361465215683, train CL loss is 0.00997621861519292
At training steps 3100, training MLE loss is 2.7660193622112272, train CL loss is 0.0098508949985262
At training steps 3150, training MLE loss is 2.76974781870842, train CL loss is 0.009873347042594105
At training steps 3200, training MLE loss is 2.7673635298013686, train CL loss is 0.0098778194678016


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 3250, training MLE loss is 2.763581642150879, train CL loss is 0.00981132491491735


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 3250, training MLE loss is 2.763581642150879, train CL loss is 0.00981132491491735, validation loss is 2.7277971083312766
Saving model...
Model Saved!
-----------------------------------
At training steps 3300, training MLE loss is 2.7584842908382416, train CL loss is 0.009503593598492444
At training steps 3350, training MLE loss is 2.7620927190780638, train CL loss is 0.00947822299436666
At training steps 3400, training MLE loss is 2.764630165497462, train CL loss is 0.009452839442528784
At training steps 3450, training MLE loss is 2.769488011598587, train CL loss is 0.009476210117572919


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 3500, training MLE loss is 2.765860366344452, train CL loss is 0.009443317642435431


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 3500, training MLE loss is 2.765860366344452, train CL loss is 0.009443317642435431, validation loss is 2.7236622967842505
Saving model...
Model Saved!
-----------------------------------
At training steps 3550, training MLE loss is 2.738622440099716, train CL loss is 0.009364554595667868
At training steps 3600, training MLE loss is 2.7500449538230898, train CL loss is 0.009269248078344389
At training steps 3650, training MLE loss is 2.7481330275535583, train CL loss is 0.009234349229373037
At training steps 3700, training MLE loss is 2.747271551191807, train CL loss is 0.009202250353991985


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 3750, training MLE loss is 2.7494402050971987, train CL loss is 0.009178293886128813


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 3750, training MLE loss is 2.7494402050971987, train CL loss is 0.009178293886128813, validation loss is 2.7173725790278604
Saving model...
Model Saved!
-----------------------------------
At training steps 3800, training MLE loss is 2.7407121777534487, train CL loss is 0.008862356750760227
At training steps 3850, training MLE loss is 2.744905315041542, train CL loss is 0.008815132634481415
At training steps 3900, training MLE loss is 2.7435131537914277, train CL loss is 0.008903665233713885
At training steps 3950, training MLE loss is 2.7441676545143125, train CL loss is 0.00889221417834051


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 4000, training MLE loss is 2.745231878042221, train CL loss is 0.00885818592319265


| |                  #                              | 156 Elapsed Time: 0:00:07


At training steps 4000, training MLE loss is 2.745231878042221, train CL loss is 0.00885818592319265, validation loss is 2.7148893380566217
Saving model...
Model Saved!
-----------------------------------
At training steps 4050, training MLE loss is 2.742732762098312, train CL loss is 0.008839131998829543
At training steps 4100, training MLE loss is 2.735285061597824, train CL loss is 0.008749975003302097
At training steps 4150, training MLE loss is 2.73658722837766, train CL loss is 0.00869693385825182
At training steps 4200, training MLE loss is 2.734177041053772, train CL loss is 0.008616921213106253


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 4250, training MLE loss is 2.735126318693161, train CL loss is 0.00856752954935655


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 4250, training MLE loss is 2.735126318693161, train CL loss is 0.00856752954935655, validation loss is 2.7102826596639256
Saving model...
Model Saved!
-----------------------------------
At training steps 4300, training MLE loss is 2.7232016348838806, train CL loss is 0.008527179346419871
At training steps 4350, training MLE loss is 2.7384156328439713, train CL loss is 0.008484955043531955
At training steps 4400, training MLE loss is 2.7344014767805733, train CL loss is 0.008439646496747931
At training steps 4450, training MLE loss is 2.7294409558176995, train CL loss is 0.008369186226045712


- | #                                                 | 1 Elapsed Time: 0:00:00

At training steps 4500, training MLE loss is 2.7303160026073456, train CL loss is 0.008359922855626792


| |                  #                              | 156 Elapsed Time: 0:00:07


At training steps 4500, training MLE loss is 2.7303160026073456, train CL loss is 0.008359922855626792, validation loss is 2.70566608852855
Saving model...
Model Saved!
-----------------------------------
At training steps 4550, training MLE loss is 2.730186598300934, train CL loss is 0.008355398597195744
At training steps 4600, training MLE loss is 2.7265275639295576, train CL loss is 0.008326253674458712
At training steps 4650, training MLE loss is 2.724432614247004, train CL loss is 0.00822322440566495
At training steps 4700, training MLE loss is 2.7197642180323602, train CL loss is 0.008157024655956776


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 4750, training MLE loss is 2.7201459259986875, train CL loss is 0.008148226924240588


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 4750, training MLE loss is 2.7201459259986875, train CL loss is 0.008148226924240588, validation loss is 2.7030218033956315
Saving model...
Model Saved!
-----------------------------------
At training steps 4800, training MLE loss is 2.716551969051361, train CL loss is 0.007949690273962915
At training steps 4850, training MLE loss is 2.7174752163887024, train CL loss is 0.007993523911572993
At training steps 4900, training MLE loss is 2.715788841644923, train CL loss is 0.008005436526921888
At training steps 4950, training MLE loss is 2.716864178776741, train CL loss is 0.007994086053222417


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 5000, training MLE loss is 2.7152415375709533, train CL loss is 0.007951709297019988


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 5000, training MLE loss is 2.7152415375709533, train CL loss is 0.007951709297019988, validation loss is 2.69948491169477
Saving model...
Model Saved!
-----------------------------------
At training steps 5050, training MLE loss is 2.715855984687805, train CL loss is 0.007893372485414147
At training steps 5100, training MLE loss is 2.715512656569481, train CL loss is 0.007827768211718648
At training steps 5150, training MLE loss is 2.7127215405305227, train CL loss is 0.0077963351652336615
At training steps 5200, training MLE loss is 2.7103149506449697, train CL loss is 0.007753568839980289


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 5250, training MLE loss is 2.7122237892150878, train CL loss is 0.007727227103896439


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 5250, training MLE loss is 2.7122237892150878, train CL loss is 0.007727227103896439, validation loss is 2.6970879419098535
Saving model...
Model Saved!
-----------------------------------
At training steps 5300, training MLE loss is 2.708297129869461, train CL loss is 0.0076117058959789576
At training steps 5350, training MLE loss is 2.6995433455705644, train CL loss is 0.007644788202596829
At training steps 5400, training MLE loss is 2.699229146639506, train CL loss is 0.007619513922836631
At training steps 5450, training MLE loss is 2.695963822901249, train CL loss is 0.0075550370698329065


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 5500, training MLE loss is 2.6976191775798797, train CL loss is 0.007527238368522376


| |                  #                              | 156 Elapsed Time: 0:00:07


At training steps 5500, training MLE loss is 2.6976191775798797, train CL loss is 0.007527238368522376, validation loss is 2.6941646569690954
Saving model...
Model Saved!
-----------------------------------
At training steps 5550, training MLE loss is 2.7106817746162415, train CL loss is 0.007523693835828453
At training steps 5600, training MLE loss is 2.711736099123955, train CL loss is 0.00761734014027752
At training steps 5650, training MLE loss is 2.7101295936107634, train CL loss is 0.007561608258790026
At training steps 5700, training MLE loss is 2.708779515326023, train CL loss is 0.007563677813159302


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 5750, training MLE loss is 2.7100850930213927, train CL loss is 0.007557152172550559


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 5750, training MLE loss is 2.7100850930213927, train CL loss is 0.007557152172550559, validation loss is 2.692822618264701
Saving model...
Model Saved!
-----------------------------------
At training steps 5800, training MLE loss is 2.7016984832286837, train CL loss is 0.0073121576523408295
At training steps 5850, training MLE loss is 2.6935718947649003, train CL loss is 0.007454829892376438
At training steps 5900, training MLE loss is 2.6981422503789267, train CL loss is 0.007408060695355137
At training steps 5950, training MLE loss is 2.691614710688591, train CL loss is 0.0074066453881096094


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 6000, training MLE loss is 2.6914072597026824, train CL loss is 0.0073820413085632025


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 6000, training MLE loss is 2.6914072597026824, train CL loss is 0.0073820413085632025, validation loss is 2.6895911284787175
Saving model...
Model Saved!
-----------------------------------
At training steps 6050, training MLE loss is 2.7059673738479613, train CL loss is 0.007228508542757481
At training steps 6100, training MLE loss is 2.7020372474193572, train CL loss is 0.007241977404337376
At training steps 6150, training MLE loss is 2.6970879077911376, train CL loss is 0.007254367634498824
At training steps 6200, training MLE loss is 2.696754969358444, train CL loss is 0.007246535188169218


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 6250, training MLE loss is 2.694045554637909, train CL loss is 0.007220228898338974


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 6250, training MLE loss is 2.694045554637909, train CL loss is 0.007220228898338974, validation loss is 2.6883660596066226
Saving model...
Model Saved!
-----------------------------------
At training steps 6300, training MLE loss is 2.6984809505939484, train CL loss is 0.007306446109432727
At training steps 6350, training MLE loss is 2.6923750042915344, train CL loss is 0.007230684086680412
At training steps 6400, training MLE loss is 2.6910815262794494, train CL loss is 0.007168489331379532
At training steps 6450, training MLE loss is 2.691974640786648, train CL loss is 0.007152304312912747


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 6500, training MLE loss is 2.693642969608307, train CL loss is 0.007137359621934593


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 6500, training MLE loss is 2.693642969608307, train CL loss is 0.007137359621934593, validation loss is 2.6874240158819007
Saving model...
Model Saved!
-----------------------------------
At training steps 6550, training MLE loss is 2.6743551540374755, train CL loss is 0.007008451763540507
At training steps 6600, training MLE loss is 2.674326540231705, train CL loss is 0.007008435275638476
At training steps 6650, training MLE loss is 2.679766837358475, train CL loss is 0.007061101321596652
At training steps 6700, training MLE loss is 2.6806391644477845, train CL loss is 0.007070142364827916


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 6750, training MLE loss is 2.6804315302371977, train CL loss is 0.0070734163932502266


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 6750, training MLE loss is 2.6804315302371977, train CL loss is 0.0070734163932502266, validation loss is 2.6857722811379277
Saving model...
Model Saved!
-----------------------------------
At training steps 6800, training MLE loss is 2.6853172075748444, train CL loss is 0.007078693034127354
At training steps 6850, training MLE loss is 2.6871281909942626, train CL loss is 0.0071321763738524165
At training steps 6900, training MLE loss is 2.6831399778525036, train CL loss is 0.007091397137070695
At training steps 6950, training MLE loss is 2.6779790437221527, train CL loss is 0.00704733079590369


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 7000, training MLE loss is 2.6811902334690094, train CL loss is 0.007029988791327923


| |                  #                              | 156 Elapsed Time: 0:00:07


At training steps 7000, training MLE loss is 2.6811902334690094, train CL loss is 0.007029988791327923, validation loss is 2.6833862317563435
Saving model...
Model Saved!
-----------------------------------
At training steps 7050, training MLE loss is 2.6911846101284027, train CL loss is 0.007057116564828903
At training steps 7100, training MLE loss is 2.6858304315805435, train CL loss is 0.007008454032475128
At training steps 7150, training MLE loss is 2.6837958268324535, train CL loss is 0.0070199937404443824
At training steps 7200, training MLE loss is 2.6797623592615127, train CL loss is 0.007006671588751488


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 7250, training MLE loss is 2.6795938377380373, train CL loss is 0.006984746418427676


| |                  #                              | 156 Elapsed Time: 0:00:07


At training steps 7250, training MLE loss is 2.6795938377380373, train CL loss is 0.006984746418427676, validation loss is 2.681625820892432
Saving model...
Model Saved!
-----------------------------------
At training steps 7300, training MLE loss is 2.686622804403305, train CL loss is 0.0068986060074530545
At training steps 7350, training MLE loss is 2.6835068106651305, train CL loss is 0.006990306100342423
At training steps 7400, training MLE loss is 2.683471364180247, train CL loss is 0.006984954710739354
At training steps 7450, training MLE loss is 2.6832870987057684, train CL loss is 0.006945973748224787


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 7500, training MLE loss is 2.6826825110912322, train CL loss is 0.006924067175481469


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 7500, training MLE loss is 2.6826825110912322, train CL loss is 0.006924067175481469, validation loss is 2.6815046929375077
Saving model...
Model Saved!
-----------------------------------
At training steps 7550, training MLE loss is 2.679696134328842, train CL loss is 0.0068370223883539435
At training steps 7600, training MLE loss is 2.678505402803421, train CL loss is 0.0068028295726981014
At training steps 7650, training MLE loss is 2.6779031519095104, train CL loss is 0.0068749887892045084
At training steps 7700, training MLE loss is 2.678090518116951, train CL loss is 0.00686251794337295


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 7750, training MLE loss is 2.6753330018520356, train CL loss is 0.006844049559906125


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 7750, training MLE loss is 2.6753330018520356, train CL loss is 0.006844049559906125, validation loss is 2.6803381983083696
Saving model...
Model Saved!
-----------------------------------
At training steps 7800, training MLE loss is 2.6793985772132873, train CL loss is 0.006842949276324362
At training steps 7850, training MLE loss is 2.676744274497032, train CL loss is 0.006871595360571518
At training steps 7900, training MLE loss is 2.6721334167321524, train CL loss is 0.006833014882480105
At training steps 7950, training MLE loss is 2.670359828770161, train CL loss is 0.006812825946835801


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 8000, training MLE loss is 2.6726896686553956, train CL loss is 0.006783379358705133


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 8000, training MLE loss is 2.6726896686553956, train CL loss is 0.006783379358705133, validation loss is 2.677836619589841
Saving model...
Model Saved!
-----------------------------------
At training steps 8050, training MLE loss is 2.66305428981781, train CL loss is 0.006752924325410276
At training steps 8100, training MLE loss is 2.6742436987161637, train CL loss is 0.006801505006151274
At training steps 8150, training MLE loss is 2.6822131260236106, train CL loss is 0.006793888483662159
At training steps 8200, training MLE loss is 2.6802688524127007, train CL loss is 0.006774348297622055


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 8250, training MLE loss is 2.6818629739284514, train CL loss is 0.006778350699692965


| |                #                                | 156 Elapsed Time: 0:00:08


At training steps 8250, training MLE loss is 2.6818629739284514, train CL loss is 0.006778350699692965, validation loss is 2.675888604726993
Saving model...
Model Saved!
-----------------------------------
At training steps 8300, training MLE loss is 2.6745682525634766, train CL loss is 0.006826460189186037
At training steps 8350, training MLE loss is 2.671733195185661, train CL loss is 0.006803471056045964
At training steps 8400, training MLE loss is 2.672587177356084, train CL loss is 0.006723214100735883
At training steps 8450, training MLE loss is 2.6755725434422493, train CL loss is 0.0067103329644305635


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 8500, training MLE loss is 2.6733955483436587, train CL loss is 0.006700434208381921


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 8500, training MLE loss is 2.6733955483436587, train CL loss is 0.006700434208381921, validation loss is 2.6761338728011506
At training steps 8550, training MLE loss is 2.6646924662590026, train CL loss is 0.0067426470876671375
At training steps 8600, training MLE loss is 2.658109104037285, train CL loss is 0.006699946301523596
At training steps 8650, training MLE loss is 2.6641476531823476, train CL loss is 0.006679333227220923
At training steps 8700, training MLE loss is 2.667198656499386, train CL loss is 0.00668180781067349


- | #                                                 | 1 Elapsed Time: 0:00:00

At training steps 8750, training MLE loss is 2.6683516635894775, train CL loss is 0.006659991087857634


| |                  #                              | 156 Elapsed Time: 0:00:07


At training steps 8750, training MLE loss is 2.6683516635894775, train CL loss is 0.006659991087857634, validation loss is 2.675668699028032
Saving model...
Model Saved!
-----------------------------------
At training steps 8800, training MLE loss is 2.669468712806702, train CL loss is 0.006690655197016895
At training steps 8850, training MLE loss is 2.667125626206398, train CL loss is 0.006718238983303308
At training steps 8900, training MLE loss is 2.665276377995809, train CL loss is 0.006697639558309068
At training steps 8950, training MLE loss is 2.6636004745960236, train CL loss is 0.006668065211270005


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 9000, training MLE loss is 2.664371607542038, train CL loss is 0.0066620773035101595


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 9000, training MLE loss is 2.664371607542038, train CL loss is 0.0066620773035101595, validation loss is 2.675294747238678
Saving model...
Model Saved!
-----------------------------------
At training steps 9050, training MLE loss is 2.678348662853241, train CL loss is 0.00663479145616293
At training steps 9100, training MLE loss is 2.670626400709152, train CL loss is 0.006611852102214471
At training steps 9150, training MLE loss is 2.6707731624444326, train CL loss is 0.0066468972992151975
At training steps 9200, training MLE loss is 2.669122249186039, train CL loss is 0.006637310994556174


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 9250, training MLE loss is 2.6708753933906557, train CL loss is 0.006639871266670525


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 9250, training MLE loss is 2.6708753933906557, train CL loss is 0.006639871266670525, validation loss is 2.674801402526794
Saving model...
Model Saved!
-----------------------------------
At training steps 9300, training MLE loss is 2.6660111927986145, train CL loss is 0.0066152511839754876
At training steps 9350, training MLE loss is 2.6662564837932585, train CL loss is 0.006605266449041664
At training steps 9400, training MLE loss is 2.6661556990941366, train CL loss is 0.006641763377313812
At training steps 9450, training MLE loss is 2.6661161735653875, train CL loss is 0.006657623973442241


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 9500, training MLE loss is 2.6648020462989805, train CL loss is 0.0066555537143722174


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 9500, training MLE loss is 2.6648020462989805, train CL loss is 0.0066555537143722174, validation loss is 2.674671017578537
Saving model...
Model Saved!
-----------------------------------
At training steps 9550, training MLE loss is 2.6626900911331175, train CL loss is 0.0065420783450827
At training steps 9600, training MLE loss is 2.670868725180626, train CL loss is 0.00654439213569276
At training steps 9650, training MLE loss is 2.6693346655368804, train CL loss is 0.00658150012139231
At training steps 9700, training MLE loss is 2.6698755019903184, train CL loss is 0.006601547127356753


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 9750, training MLE loss is 2.6700197575092317, train CL loss is 0.006612891255877912


| |                  #                              | 156 Elapsed Time: 0:00:07


At training steps 9750, training MLE loss is 2.6700197575092317, train CL loss is 0.006612891255877912, validation loss is 2.6741667991385896
Saving model...
Model Saved!
-----------------------------------
At training steps 9800, training MLE loss is 2.665933403968811, train CL loss is 0.006658593823667616
At training steps 9850, training MLE loss is 2.667837498784065, train CL loss is 0.0066038946725893765
At training steps 9900, training MLE loss is 2.667009958823522, train CL loss is 0.006597098892088979
At training steps 9950, training MLE loss is 2.665754587352276, train CL loss is 0.006580807988357264


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 10000, training MLE loss is 2.666631906032562, train CL loss is 0.006587718073045835


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 10000, training MLE loss is 2.666631906032562, train CL loss is 0.006587718073045835, validation loss is 2.674269430006514
Training stage completed!
############################################################


In [8]:
# Run configuration pointing at the MSCOCO json splits; every field is passed
# by keyword so each literal is labelled with the Args parameter it fills.
mscoco_args = Args(
    model_name='gpt2',
    train_path="/kaggle/input/mscoco/mscoco_train.json",
    dev_path="/kaggle/input/mscoco/mscoco_val.json",
    test_path="/kaggle/input/mscoco/mscoco_test.json",
    add_eos_token_to_data='True',  # kept as a string: the training cell compares against 'True'/'False'
    margin=0.5,
    max_len=64,
    number_of_gpu=1,
    batch_size_per_gpu=32,
    gradient_accumulation_steps=4,
    effective_batch_size=128,
    total_steps=20000,
    print_every=100,
    save_every=500,
    learning_rate=2e-5,
    save_path_prefix="/kaggle/working/mscoco",
)

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.multiprocessing as mp
import argparse, os
import random
import numpy as np
import time
import logging
import progressbar

import logging
logging.getLogger('transformers.generation_utils').disabled = True
import argparse
if __name__ == '__main__':
    # Training driver: probe the hardware, build the dataset and the SimCTG
    # model from the settings in `args`, then hand everything to model_training.
    #
    # NOTE(review): this cell reads `args`; the `mscoco_args` object built in
    # the preceding cell is never referenced here. Confirm which configuration
    # this run is meant to use before re-executing.

    # --- hardware detection -------------------------------------------------
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        print ('Cuda is available.')
    multi_gpu_training = cuda_available and torch.cuda.device_count() > 1
    if multi_gpu_training:
        print ('Using Multi-GPU training, number of GPU is {}'.format(torch.cuda.device_count()))
    elif cuda_available:
        print ('Using single GPU training.')

    # Pick the device from the probe above instead of hard-coding 'cuda', so
    # that a CPU-only machine does not pass a CUDA device handle downstream.
    device = torch.device('cuda' if cuda_available else 'cpu')
    model_name = args.model_name

    # --- data ---------------------------------------------------------------
    sos_token, pad_token = r'<-start_of_text->', r'<-pad->'
    # The flag arrives as the string 'True' / 'False'; convert it to a real
    # bool and reject anything else.
    add_eos_token_to_data = args.add_eos_token_to_data
    if add_eos_token_to_data == 'True':
        add_eos_token_to_data = True
        print ('Add eos token to data!')
    elif add_eos_token_to_data == 'False':
        add_eos_token_to_data = False
        print ('Do not add eos token to data!')
    else:
        raise Exception('Wrong eos configuration for data!!!')
    print ('Loading data...')
    #from dataclass import Data
    data = Data(model_name, args.train_path, args.dev_path, args.test_path, args.max_len,
        sos_token, pad_token, add_eos_token_to_data)
    print ('Data loaded.')

    # --- model --------------------------------------------------------------
    #from trainer import model_training
    print ('############################################################')
    print ('Start Training...')
    #from simctg import SimCTG
    print ('Initializaing SimCTG model...')
    model = SimCTG(model_name, sos_token, pad_token)
    if cuda_available:
        if multi_gpu_training:
            model = nn.DataParallel(model) # multi-gpu training
        model = model.to(device)
    print ('Model loaded')

    # --- training -----------------------------------------------------------
    total_steps, print_every, save_every = args.total_steps, args.print_every, args.save_every
    ckpt_save_path = args.save_path_prefix
    model = model_training(args, data, model, total_steps, print_every, save_every,
        ckpt_save_path, cuda_available, device)
    print ('Training stage completed!')
    print ('############################################################')

Cuda is available.
Using single GPU training.
Add eos token to data!
Loading data...


/ |#                                             | 145000 Elapsed Time: 0:00:00

Add token to the tokenizer.
Original vocabulary size is 50257
Vocabulary size after extension is 50258
sos token is <-start_of_text->, sos token id is 50257
Add token to the tokenizer.
Original vocabulary size is 50258
Vocabulary size after extension is 50259
pad token is <-pad->, pad token id is 50258
eos token is <|endoftext|>, eos token id is 50256
Processing /kaggle/input/flickr30k/flickr30k/flickr30k/flickr30k_train.json


| |      #                                       | 144999 Elapsed Time: 0:00:17
- | #                                               | 612 Elapsed Time: 0:00:00

/kaggle/input/flickr30k/flickr30k/flickr30k/flickr30k_train.json processed!
Processing /kaggle/input/flickr30k/flickr30k/flickr30k/flickr30k_val.json


| |        #                                       | 5069 Elapsed Time: 0:00:00
- | #                                               | 846 Elapsed Time: 0:00:00

/kaggle/input/flickr30k/flickr30k/flickr30k/flickr30k_val.json processed!
Processing /kaggle/input/flickr30k/flickr30k/flickr30k/flickr30k_test.json


| |       #                                        | 4999 Elapsed Time: 0:00:00


/kaggle/input/flickr30k/flickr30k/flickr30k/flickr30k_test.json processed!
train number:145000, dev number:5070, test number:5000
Data loaded.
############################################################
Start Training...
Initializaing SimCTG model...
Add token to the tokenizer.
Original vocabulary size is 50257
Vocabulary size after extension is 50258
sos token is <-start_of_text->, sos token id is 50257
Add token to the tokenizer.
Original vocabulary size is 50258
Vocabulary size after extension is 50259
pad token is <-pad->, pad token id is 50258
eos token is <|endoftext|>, eos token id is 50256
Resizing model embedding...
Model embedding resized!
Model loaded
total training steps is 10000, warmup steps is 1000
--------------------------------------------------------------------------
Start Training:
At training steps 50, training MLE loss is 86.50591785430908, train CL loss is 0.4446336743235588
At training steps 100, training MLE loss is 75.01354355812073, train CL loss is 0.43811

- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 250, training MLE loss is 34.94899475622177, train CL loss is 0.29817717902362345


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 250, training MLE loss is 34.94899475622177, train CL loss is 0.29817717902362345, validation loss is 3.4792544365583056
Saving model...
Model Saved!
-----------------------------------
At training steps 300, training MLE loss is 3.5857187795639036, train CL loss is 0.1916647234559059
At training steps 350, training MLE loss is 3.5094057750701904, train CL loss is 0.17686646677553652
At training steps 400, training MLE loss is 3.4472061494986215, train CL loss is 0.1619758374368151
At training steps 450, training MLE loss is 3.403219172358513, train CL loss is 0.14641440546140075


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 500, training MLE loss is 3.3667803025245666, train CL loss is 0.13067736952379347


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 500, training MLE loss is 3.3667803025245666, train CL loss is 0.13067736952379347, validation loss is 3.0542421535590325
Saving model...
Model Saved!
-----------------------------------
At training steps 550, training MLE loss is 3.1744389116764067, train CL loss is 0.0515899771079421
At training steps 600, training MLE loss is 3.155550929903984, train CL loss is 0.045895846262574196
At training steps 650, training MLE loss is 3.144946635961533, train CL loss is 0.04265876375138759
At training steps 700, training MLE loss is 3.132600277066231, train CL loss is 0.04016605405369773


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 750, training MLE loss is 3.1126807606220246, train CL loss is 0.0382527774181217


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 750, training MLE loss is 3.1126807606220246, train CL loss is 0.0382527774181217, validation loss is 2.925877484746651
Saving model...
Model Saved!
-----------------------------------
At training steps 800, training MLE loss is 3.028925575017929, train CL loss is 0.02690190196968615
At training steps 850, training MLE loss is 3.0285982376337053, train CL loss is 0.026746195470914245
At training steps 900, training MLE loss is 3.0255375719070434, train CL loss is 0.025978145375847816
At training steps 950, training MLE loss is 3.017245715856552, train CL loss is 0.025209908813703805


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 1000, training MLE loss is 3.0078969316482542, train CL loss is 0.024717320766299962


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 1000, training MLE loss is 3.0078969316482542, train CL loss is 0.024717320766299962, validation loss is 2.8660834319329602
Saving model...
Model Saved!
-----------------------------------
At training steps 1050, training MLE loss is 2.961170206069946, train CL loss is 0.021612802827730773
At training steps 1100, training MLE loss is 2.958592136502266, train CL loss is 0.02094020222313702
At training steps 1150, training MLE loss is 2.9552122008800508, train CL loss is 0.020452892795826
At training steps 1200, training MLE loss is 2.950014791786671, train CL loss is 0.020040335555095226


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 1250, training MLE loss is 2.944648115634918, train CL loss is 0.01964418153744191


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 1250, training MLE loss is 2.944648115634918, train CL loss is 0.01964418153744191, validation loss is 2.8276752540126053
Saving model...
Model Saved!
-----------------------------------
At training steps 1300, training MLE loss is 2.916049538850784, train CL loss is 0.017631593863479792
At training steps 1350, training MLE loss is 2.911668943762779, train CL loss is 0.01733106910251081
At training steps 1400, training MLE loss is 2.903133350610733, train CL loss is 0.017097536966515083
At training steps 1450, training MLE loss is 2.89981445223093, train CL loss is 0.016782381524099037


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 1500, training MLE loss is 2.8967087478637694, train CL loss is 0.01651110456418246


| |                  #                              | 156 Elapsed Time: 0:00:07


At training steps 1500, training MLE loss is 2.8967087478637694, train CL loss is 0.01651110456418246, validation loss is 2.803403418298374
Saving model...
Model Saved!
-----------------------------------
At training steps 1550, training MLE loss is 2.8782851922512056, train CL loss is 0.01554420059081167
At training steps 1600, training MLE loss is 2.8751578229665755, train CL loss is 0.015282964839134366
At training steps 1650, training MLE loss is 2.8675374873479207, train CL loss is 0.015063471253961324
At training steps 1700, training MLE loss is 2.8621054020524026, train CL loss is 0.0148641369282268


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 1750, training MLE loss is 2.860811891555786, train CL loss is 0.014784401901066304


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 1750, training MLE loss is 2.860811891555786, train CL loss is 0.014784401901066304, validation loss is 2.7839818905881235
Saving model...
Model Saved!
-----------------------------------
At training steps 1800, training MLE loss is 2.8624999356269836, train CL loss is 0.014060816918499768
At training steps 1850, training MLE loss is 2.859174150824547, train CL loss is 0.01381904661655426
At training steps 1900, training MLE loss is 2.8536959556738535, train CL loss is 0.013694726627630492
At training steps 1950, training MLE loss is 2.850071816146374, train CL loss is 0.01355558654293418


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 2000, training MLE loss is 2.8461762821674346, train CL loss is 0.01345369351003319


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 2000, training MLE loss is 2.8461762821674346, train CL loss is 0.01345369351003319, validation loss is 2.7708700681108693
Saving model...
Model Saved!
-----------------------------------
At training steps 2050, training MLE loss is 2.8289731442928314, train CL loss is 0.01269469125661999
At training steps 2100, training MLE loss is 2.8154546666145324, train CL loss is 0.012545929686166347
At training steps 2150, training MLE loss is 2.818215719461441, train CL loss is 0.012539339436528584
At training steps 2200, training MLE loss is 2.8187610748410226, train CL loss is 0.012405321571277455


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 2250, training MLE loss is 2.8151691653728483, train CL loss is 0.012260988851077854


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 2250, training MLE loss is 2.8151691653728483, train CL loss is 0.012260988851077854, validation loss is 2.7597519201754794
Saving model...
Model Saved!
-----------------------------------
At training steps 2300, training MLE loss is 2.816501452922821, train CL loss is 0.01173126363195479
At training steps 2350, training MLE loss is 2.8119233506917953, train CL loss is 0.011630208005663008
At training steps 2400, training MLE loss is 2.8118707211812337, train CL loss is 0.011585213906752566
At training steps 2450, training MLE loss is 2.806650701165199, train CL loss is 0.0114550953253638


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 2500, training MLE loss is 2.8070769486427305, train CL loss is 0.011360606632195413


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 2500, training MLE loss is 2.8070769486427305, train CL loss is 0.011360606632195413, validation loss is 2.747862123020917
Saving model...
Model Saved!
-----------------------------------
At training steps 2550, training MLE loss is 2.779461292028427, train CL loss is 0.011100975787267089
At training steps 2600, training MLE loss is 2.7957763344049456, train CL loss is 0.010922284710686654
At training steps 2650, training MLE loss is 2.7873291170597074, train CL loss is 0.01088967251436164
At training steps 2700, training MLE loss is 2.78712953299284, train CL loss is 0.01086031535291113


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 2750, training MLE loss is 2.7880057775974274, train CL loss is 0.010806973490398377


| |                  #                              | 156 Elapsed Time: 0:00:07


At training steps 2750, training MLE loss is 2.7880057775974274, train CL loss is 0.010806973490398377, validation loss is 2.7365111580416093
Saving model...
Model Saved!
-----------------------------------
At training steps 2800, training MLE loss is 2.7852603936195375, train CL loss is 0.010284245694056154
At training steps 2850, training MLE loss is 2.794443464875221, train CL loss is 0.010310928664403037
At training steps 2900, training MLE loss is 2.7906944823265074, train CL loss is 0.010203764395943532
At training steps 2950, training MLE loss is 2.78956950455904, train CL loss is 0.010124954935745336


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 3000, training MLE loss is 2.7881384513378142, train CL loss is 0.01006608795421198


| |                  #                              | 156 Elapsed Time: 0:00:07


At training steps 3000, training MLE loss is 2.7881384513378142, train CL loss is 0.01006608795421198, validation loss is 2.7315297013854094
Saving model...
Model Saved!
-----------------------------------
At training steps 3050, training MLE loss is 2.7666773653030394, train CL loss is 0.009936242839321494
At training steps 3100, training MLE loss is 2.7633174031972887, train CL loss is 0.009789401242742315
At training steps 3150, training MLE loss is 2.764438907702764, train CL loss is 0.009759445186549177
At training steps 3200, training MLE loss is 2.762897124886513, train CL loss is 0.00974587119766511


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 3250, training MLE loss is 2.763734133243561, train CL loss is 0.009713584505021572


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 3250, training MLE loss is 2.763734133243561, train CL loss is 0.009713584505021572, validation loss is 2.726212669667498
Saving model...
Model Saved!
-----------------------------------
At training steps 3300, training MLE loss is 2.755035563707352, train CL loss is 0.009461078885942697
At training steps 3350, training MLE loss is 2.752248782515526, train CL loss is 0.009370006383396684
At training steps 3400, training MLE loss is 2.7520154122511546, train CL loss is 0.009373502614131819
At training steps 3450, training MLE loss is 2.7551573753356933, train CL loss is 0.009365768587449565


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 3500, training MLE loss is 2.7558072390556334, train CL loss is 0.009327884730882943


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 3500, training MLE loss is 2.7558072390556334, train CL loss is 0.009327884730882943, validation loss is 2.7215985175986237
Saving model...
Model Saved!
-----------------------------------
At training steps 3550, training MLE loss is 2.753508977890015, train CL loss is 0.009181281405035406
At training steps 3600, training MLE loss is 2.7565709829330443, train CL loss is 0.00918081490905024
At training steps 3650, training MLE loss is 2.754525634845098, train CL loss is 0.009176083499720942
At training steps 3700, training MLE loss is 2.7531027778983117, train CL loss is 0.009101171916699968


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 3750, training MLE loss is 2.7522094061374665, train CL loss is 0.009067034141160548


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 3750, training MLE loss is 2.7522094061374665, train CL loss is 0.009067034141160548, validation loss is 2.7151759225955887
Saving model...
Model Saved!
-----------------------------------
At training steps 3800, training MLE loss is 2.7482115960121156, train CL loss is 0.008905627585481852
At training steps 3850, training MLE loss is 2.7467210656404495, train CL loss is 0.00896376089542173
At training steps 3900, training MLE loss is 2.7510768469174702, train CL loss is 0.008912856007615726
At training steps 3950, training MLE loss is 2.748846605718136, train CL loss is 0.008795350362197496


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 4000, training MLE loss is 2.7438323862552645, train CL loss is 0.008747163327410818


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 4000, training MLE loss is 2.7438323862552645, train CL loss is 0.008747163327410818, validation loss is 2.7114607474640584
Saving model...
Model Saved!
-----------------------------------
At training steps 4050, training MLE loss is 2.746242731809616, train CL loss is 0.008390166303142906
At training steps 4100, training MLE loss is 2.7347631359100344, train CL loss is 0.008494691347004845
At training steps 4150, training MLE loss is 2.7359437771638233, train CL loss is 0.008525221251572172
At training steps 4200, training MLE loss is 2.7392246437072756, train CL loss is 0.008496823219466022


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 4250, training MLE loss is 2.7374220113754273, train CL loss is 0.008500004099681973


| |                  #                              | 156 Elapsed Time: 0:00:07


At training steps 4250, training MLE loss is 2.7374220113754273, train CL loss is 0.008500004099681973, validation loss is 2.7061526170417443
Saving model...
Model Saved!
-----------------------------------
At training steps 4300, training MLE loss is 2.7243010222911836, train CL loss is 0.00828922511311248
At training steps 4350, training MLE loss is 2.7240605717897415, train CL loss is 0.008328378529986366
At training steps 4400, training MLE loss is 2.7249705334504446, train CL loss is 0.008295924078362684
At training steps 4450, training MLE loss is 2.727466923594475, train CL loss is 0.008340855198912322


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 4500, training MLE loss is 2.7279994122982023, train CL loss is 0.008336686646100133


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 4500, training MLE loss is 2.7279994122982023, train CL loss is 0.008336686646100133, validation loss is 2.7030625452717914
Saving model...
Model Saved!
-----------------------------------
At training steps 4550, training MLE loss is 2.72907369017601, train CL loss is 0.008102100456599147
At training steps 4600, training MLE loss is 2.724970841407776, train CL loss is 0.008095431260298937
At training steps 4650, training MLE loss is 2.7248940404256183, train CL loss is 0.008066758643835784
At training steps 4700, training MLE loss is 2.7195638337731363, train CL loss is 0.008034166496945544


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 4750, training MLE loss is 2.7205360260009765, train CL loss is 0.0080276961424388


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 4750, training MLE loss is 2.7205360260009765, train CL loss is 0.0080276961424388, validation loss is 2.7011788601953515
Saving model...
Model Saved!
-----------------------------------
At training steps 4800, training MLE loss is 2.7310378730297087, train CL loss is 0.007933434240985662
At training steps 4850, training MLE loss is 2.7308504515886307, train CL loss is 0.007920664568664505
At training steps 4900, training MLE loss is 2.726236433585485, train CL loss is 0.007930505027373632
At training steps 4950, training MLE loss is 2.7234598964452745, train CL loss is 0.00789984397822991


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 5000, training MLE loss is 2.720565677165985, train CL loss is 0.007912985518109053


| |                  #                              | 156 Elapsed Time: 0:00:07


At training steps 5000, training MLE loss is 2.720565677165985, train CL loss is 0.007912985518109053, validation loss is 2.6979236070644905
Saving model...
Model Saved!
-----------------------------------
At training steps 5050, training MLE loss is 2.7233039700984953, train CL loss is 0.007857356497552245
At training steps 5100, training MLE loss is 2.7164350187778474, train CL loss is 0.007786153551423922
At training steps 5150, training MLE loss is 2.709970299402873, train CL loss is 0.0077495279310581585
At training steps 5200, training MLE loss is 2.715164274573326, train CL loss is 0.007717715105391107


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 5250, training MLE loss is 2.7109481780529023, train CL loss is 0.007682757441885769


| |                  #                              | 156 Elapsed Time: 0:00:07


At training steps 5250, training MLE loss is 2.7109481780529023, train CL loss is 0.007682757441885769, validation loss is 2.696627116984163
Saving model...
Model Saved!
-----------------------------------
At training steps 5300, training MLE loss is 2.7160015153884887, train CL loss is 0.007680256415624171
At training steps 5350, training MLE loss is 2.7108998382091523, train CL loss is 0.007588859628885985
At training steps 5400, training MLE loss is 2.7147406510512035, train CL loss is 0.007616754143188397
At training steps 5450, training MLE loss is 2.717227800190449, train CL loss is 0.0076073903142241765


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 5500, training MLE loss is 2.716736418247223, train CL loss is 0.0075903131030499935


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 5500, training MLE loss is 2.716736418247223, train CL loss is 0.0075903131030499935, validation loss is 2.6939708194075793
Saving model...
Model Saved!
-----------------------------------
At training steps 5550, training MLE loss is 2.7168583488464355, train CL loss is 0.007517939535900951
At training steps 5600, training MLE loss is 2.7089650893211363, train CL loss is 0.007528452008264139
At training steps 5650, training MLE loss is 2.7060008605321246, train CL loss is 0.007516814654227346
At training steps 5700, training MLE loss is 2.7038873633742333, train CL loss is 0.0074979206558782605


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 5750, training MLE loss is 2.701066177368164, train CL loss is 0.007484365808777511


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 5750, training MLE loss is 2.701066177368164, train CL loss is 0.007484365808777511, validation loss is 2.6913580465188516
Saving model...
Model Saved!
-----------------------------------
At training steps 5800, training MLE loss is 2.6962998032569887, train CL loss is 0.007456042733974755
At training steps 5850, training MLE loss is 2.699984254837036, train CL loss is 0.007408129593823105
At training steps 5900, training MLE loss is 2.698524075349172, train CL loss is 0.007360100579292824
At training steps 5950, training MLE loss is 2.697005010843277, train CL loss is 0.0073403760086512195


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 6000, training MLE loss is 2.697796677827835, train CL loss is 0.007318161675706506


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 6000, training MLE loss is 2.697796677827835, train CL loss is 0.007318161675706506, validation loss is 2.690002708581925
Saving model...
Model Saved!
-----------------------------------
At training steps 6050, training MLE loss is 2.7084562945365906, train CL loss is 0.007310065622441471
At training steps 6100, training MLE loss is 2.6991971349716186, train CL loss is 0.007285958213033155
At training steps 6150, training MLE loss is 2.6923939259847005, train CL loss is 0.007301216734728466
At training steps 6200, training MLE loss is 2.6883799889683724, train CL loss is 0.0072897862328682094


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 6250, training MLE loss is 2.690589182138443, train CL loss is 0.007271432145033032


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 6250, training MLE loss is 2.690589182138443, train CL loss is 0.007271432145033032, validation loss is 2.6882229958850052
Saving model...
Model Saved!
-----------------------------------
At training steps 6300, training MLE loss is 2.698529943227768, train CL loss is 0.0070693565532565115
At training steps 6350, training MLE loss is 2.6940562123060228, train CL loss is 0.007080130157992244
At training steps 6400, training MLE loss is 2.6864104489485423, train CL loss is 0.007081187727550666
At training steps 6450, training MLE loss is 2.686900497972965, train CL loss is 0.00707276746223215


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 6500, training MLE loss is 2.686302544593811, train CL loss is 0.007060623284894973


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 6500, training MLE loss is 2.686302544593811, train CL loss is 0.007060623284894973, validation loss is 2.686125824568512
Saving model...
Model Saved!
-----------------------------------
At training steps 6550, training MLE loss is 2.682853181362152, train CL loss is 0.007050684948917478
At training steps 6600, training MLE loss is 2.682671811580658, train CL loss is 0.0070226785179693255
At training steps 6650, training MLE loss is 2.681231427192688, train CL loss is 0.0070113495543288685
At training steps 6700, training MLE loss is 2.6826431468129157, train CL loss is 0.00699934852833394


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 6750, training MLE loss is 2.6820868053436278, train CL loss is 0.006981830241624266


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 6750, training MLE loss is 2.6820868053436278, train CL loss is 0.006981830241624266, validation loss is 2.6843514497518655
Saving model...
Model Saved!
-----------------------------------
At training steps 6800, training MLE loss is 2.6880186462402342, train CL loss is 0.0070018356060609225
At training steps 6850, training MLE loss is 2.6907732135057447, train CL loss is 0.00698815580108203
At training steps 6900, training MLE loss is 2.687016803026199, train CL loss is 0.006974850055606415
At training steps 6950, training MLE loss is 2.68310869961977, train CL loss is 0.006988698901841417


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 7000, training MLE loss is 2.684938769340515, train CL loss is 0.007025787631049752


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 7000, training MLE loss is 2.684938769340515, train CL loss is 0.007025787631049752, validation loss is 2.6825598243997857
Saving model...
Model Saved!
-----------------------------------
At training steps 7050, training MLE loss is 2.6792464804649354, train CL loss is 0.007073643887415528
At training steps 7100, training MLE loss is 2.678948495388031, train CL loss is 0.006961327136959881
At training steps 7150, training MLE loss is 2.6853137771288553, train CL loss is 0.00695730972647046
At training steps 7200, training MLE loss is 2.6830999937653544, train CL loss is 0.0069397627376019955


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 7250, training MLE loss is 2.682803137779236, train CL loss is 0.006931454594247043


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 7250, training MLE loss is 2.682803137779236, train CL loss is 0.006931454594247043, validation loss is 2.6804154161021
Saving model...
Model Saved!
-----------------------------------
At training steps 7300, training MLE loss is 2.691908274888992, train CL loss is 0.006833030369598418
At training steps 7350, training MLE loss is 2.6842998921871186, train CL loss is 0.006877129827626049
At training steps 7400, training MLE loss is 2.682914686203003, train CL loss is 0.006862838057180245
At training steps 7450, training MLE loss is 2.684588363468647, train CL loss is 0.006913781462353654


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 7500, training MLE loss is 2.685162354707718, train CL loss is 0.006879777234978974


| |                  #                              | 156 Elapsed Time: 0:00:07


At training steps 7500, training MLE loss is 2.685162354707718, train CL loss is 0.006879777234978974, validation loss is 2.67916139005439
Saving model...
Model Saved!
-----------------------------------
At training steps 7550, training MLE loss is 2.6916236090660095, train CL loss is 0.006728229899890721
At training steps 7600, training MLE loss is 2.6864327430725097, train CL loss is 0.0067985605145804585
At training steps 7650, training MLE loss is 2.685591219266256, train CL loss is 0.006809285698303332
At training steps 7700, training MLE loss is 2.6828581312298776, train CL loss is 0.006806636044057086


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 7750, training MLE loss is 2.6837402241230013, train CL loss is 0.006803704244084656


| |                  #                              | 156 Elapsed Time: 0:00:07


At training steps 7750, training MLE loss is 2.6837402241230013, train CL loss is 0.006803704244084656, validation loss is 2.6780675792799715
Saving model...
Model Saved!
-----------------------------------
At training steps 7800, training MLE loss is 2.6692636132240297, train CL loss is 0.006890996091533453
At training steps 7850, training MLE loss is 2.678474935889244, train CL loss is 0.006838605671655387
At training steps 7900, training MLE loss is 2.6713732139269513, train CL loss is 0.006789532968929659
At training steps 7950, training MLE loss is 2.669508013129234, train CL loss is 0.006786242424859665


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 8000, training MLE loss is 2.6686993906497953, train CL loss is 0.006761864523403347


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 8000, training MLE loss is 2.6686993906497953, train CL loss is 0.006761864523403347, validation loss is 2.676811147041697
Saving model...
Model Saved!
-----------------------------------
At training steps 8050, training MLE loss is 2.6702385890483855, train CL loss is 0.00660444414941594
At training steps 8100, training MLE loss is 2.6692964392900467, train CL loss is 0.006650283854687586
At training steps 8150, training MLE loss is 2.670385706027349, train CL loss is 0.0066380651597864925
At training steps 8200, training MLE loss is 2.6686617103219032, train CL loss is 0.006690542867290788


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 8250, training MLE loss is 2.668591616868973, train CL loss is 0.006703532222658396


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 8250, training MLE loss is 2.668591616868973, train CL loss is 0.006703532222658396, validation loss is 2.676940350622814
At training steps 8300, training MLE loss is 2.676745672225952, train CL loss is 0.006726080882363022
At training steps 8350, training MLE loss is 2.6757458204030993, train CL loss is 0.006748063814593479
At training steps 8400, training MLE loss is 2.6736981507142383, train CL loss is 0.00667883758743604
At training steps 8450, training MLE loss is 2.673828758597374, train CL loss is 0.00669326092407573


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 8500, training MLE loss is 2.673412171602249, train CL loss is 0.006692436147015542


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 8500, training MLE loss is 2.673412171602249, train CL loss is 0.006692436147015542, validation loss is 2.6761603924011066
Saving model...
Model Saved!
-----------------------------------
At training steps 8550, training MLE loss is 2.677997080087662, train CL loss is 0.006720739623997361
At training steps 8600, training MLE loss is 2.681458113193512, train CL loss is 0.006668533170595765
At training steps 8650, training MLE loss is 2.678089912732442, train CL loss is 0.006693617573473603
At training steps 8700, training MLE loss is 2.6732740843296052, train CL loss is 0.006677523668040522


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 8750, training MLE loss is 2.6736988108158113, train CL loss is 0.00666866759583354


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 8750, training MLE loss is 2.6736988108158113, train CL loss is 0.00666866759583354, validation loss is 2.675207535304272
Saving model...
Model Saved!
-----------------------------------
At training steps 8800, training MLE loss is 2.6688389098644256, train CL loss is 0.006682146030943841
At training steps 8850, training MLE loss is 2.6710133254528046, train CL loss is 0.006636038521537557
At training steps 8900, training MLE loss is 2.6695152103900908, train CL loss is 0.006620200797139357
At training steps 8950, training MLE loss is 2.670042479932308, train CL loss is 0.006621497492305934


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 9000, training MLE loss is 2.66966073346138, train CL loss is 0.00662111180415377


| |                 #                               | 156 Elapsed Time: 0:00:08


At training steps 9000, training MLE loss is 2.66966073346138, train CL loss is 0.00662111180415377, validation loss is 2.6741736418938977
Saving model...
Model Saved!
-----------------------------------
At training steps 9050, training MLE loss is 2.6609369349479675, train CL loss is 0.006565530791413039
At training steps 9100, training MLE loss is 2.666455587744713, train CL loss is 0.006559667466208338
At training steps 9150, training MLE loss is 2.6699963327248892, train CL loss is 0.0065782606656042235
At training steps 9200, training MLE loss is 2.6707584619522096, train CL loss is 0.006597354690311477


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 9250, training MLE loss is 2.672373213529587, train CL loss is 0.006606733935885132


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 9250, training MLE loss is 2.672373213529587, train CL loss is 0.006606733935885132, validation loss is 2.673926485885605
Saving model...
Model Saved!
-----------------------------------
At training steps 9300, training MLE loss is 2.663531507253647, train CL loss is 0.006547241010703146
At training steps 9350, training MLE loss is 2.6617275673151015, train CL loss is 0.006509475170169025
At training steps 9400, training MLE loss is 2.6589776253700257, train CL loss is 0.0065508868902300794
At training steps 9450, training MLE loss is 2.660111123919487, train CL loss is 0.006566842932370491


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 9500, training MLE loss is 2.662328051328659, train CL loss is 0.006554687559138983


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 9500, training MLE loss is 2.662328051328659, train CL loss is 0.006554687559138983, validation loss is 2.673934584044686
At training steps 9550, training MLE loss is 2.669171677827835, train CL loss is 0.006595007593277842
At training steps 9600, training MLE loss is 2.6686159670352936, train CL loss is 0.006546075842343271
At training steps 9650, training MLE loss is 2.6740362191200258, train CL loss is 0.006550703609827906
At training steps 9700, training MLE loss is 2.670478744506836, train CL loss is 0.0065314780722837895


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 9750, training MLE loss is 2.6683958072662355, train CL loss is 0.006521219823509455


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 9750, training MLE loss is 2.6683958072662355, train CL loss is 0.006521219823509455, validation loss is 2.6733436984833796
Saving model...
Model Saved!
-----------------------------------
At training steps 9800, training MLE loss is 2.659423379898071, train CL loss is 0.006480978152249009
At training steps 9850, training MLE loss is 2.6616420328617094, train CL loss is 0.006529177543707192
At training steps 9900, training MLE loss is 2.6636938563982646, train CL loss is 0.006521945266673962
At training steps 9950, training MLE loss is 2.6669489607214927, train CL loss is 0.006549172606319189


- | #                                                 | 2 Elapsed Time: 0:00:00

At training steps 10000, training MLE loss is 2.6677763950824738, train CL loss is 0.006544685707893223


| |                   #                             | 156 Elapsed Time: 0:00:07


At training steps 10000, training MLE loss is 2.6677763950824738, train CL loss is 0.006544685707893223, validation loss is 2.6732797284184375
Saving model...
Model Saved!
-----------------------------------
Training stage completed!
############################################################
