### ライブラリの準備

###モジュールのインポートとGoogleドライブのマウント

In [1]:
import os
import glob
import math
import random
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import datetime
#from tqdm import tqdm
from tqdm.notebook import tqdm
import pickle
import random
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from PIL import Image
import skimage.transform
from collections import deque
from typing import Sequence, Dict, Tuple, Union

import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import models
import torchvision.transforms as T
import torchvision.datasets as dataset
from torchvision.transforms import v2

from timm.scheduler import CosineLRScheduler
from transformers import  get_linear_schedule_with_warmup

#from transformers import AutoImageProcessor, AutoModel, AutoProcessor, CLIPVisionModel
from transformers import BertTokenizer, BertModel, CLIPVisionModel, BertForPreTraining

import sys

import util
import levenshtein
from nltk import bleu_score
import ssl
from torch.amp import autocast, GradScaler

### 位置エンコーディングの実装

In [2]:
class PositionalEmbedding(nn.Module):
    '''
    位置埋め込み （Positional embedding）
    dim_embedding: 埋込み次元
    max_len      : 入力の最大系列長
    '''
    def __init__(self, dim_embedding: int, max_len: int=2048):
        super().__init__()

        self.pos_emb = nn.Embedding(max_len, dim_embedding)

    '''
    位置エンコーディングの順伝播
    x: 位置エンコーディングを埋め込む対象のテンソル,
       [バッチサイズ, 系列長, 埋め込み次元]
    '''
    def forward(self, x: torch.Tensor):
        seq = x.shape[1]
        positions = torch.arange(start=0, end=seq, step=1, device=x.device).to(torch.long)
        positions = self.pos_emb(positions)[:seq,:]
        
        return positions

### Transformerデコーダの実装

### CaptioningTransformerの実装

In [3]:
class CaptioningTransformer(nn.Module):
    '''
    CaptioningTransformerのコンストラクタ
    dim_embedding  : 埋め込み次元
    dim_feedforward: FNNの中間特徴次元
    num_heads      : マルチヘッドアテンションのヘッド数
    num_layers     : Transformerデコーダ層の数
    vocab_size     : 辞書の次元
    null_index     : NULLのID
    dropout        : ドロップアウト確率
    '''
    def __init__(self, img_size: int, length_max: int, dim_embedding: int,
                  vocab_size: int, tokenizer, dropout: float=0.1, model_id: str=''):
        super().__init__()

        self.mask_token_id = tokenizer.mask_token_id
        self.pad_token_id = tokenizer.pad_token_id
        self.max_idx_en = len( tokenizer )

        #CLIP
        clip_model_id = "openai/clip-vit-large-patch14-336"
        self.clip_model = CLIPVisionModel.from_pretrained(clip_model_id, output_hidden_states = True)
        images = torch.randn( ( 1, 3, img_size, img_size ) )
        memory = self.clip_model( images )
        memory = memory.last_hidden_state
        img_length = memory.size(1)
        clip_dim = memory.size(2)
        self.ln_memory = nn.LayerNorm( dim_embedding )

        self.emb = nn.Embedding( vocab_size, dim_embedding, padding_idx=tokenizer.pad_token_id )
        self.pos_emb = PositionalEmbedding( dim_embedding )

        self.dropout = nn.Dropout( dropout )

        self.dc_linear = nn.Linear( clip_dim * 3, dim_embedding )
        #self.dc_ln = nn.LayerNorm( dim_embedding )

        # Down Sampling
        #img_length = 577
        #length_max = 84
        stride = img_length // length_max
        self.conv1 = nn.Conv1d( dim_embedding, dim_embedding, 1, stride )
        print( "img_length:", img_length )
        print( "text_length_max:", length_max )
        print( "stride:", stride )
        seq_len = self.conv1( memory.transpose(1,2) ).size( 2 )
        
        self.bert = BertModel.from_pretrained( model_id )

        ## 単語出力分布計算
        self.ln_outputs = nn.LayerNorm( dim_embedding )
        self.linear = nn.Linear(dim_embedding, vocab_size)

        self.ln_length = nn.LayerNorm( dim_embedding )
        self.conv_length = nn.Conv1d( seq_len, 1, 1 )
        self.embed_lengths = nn.Embedding(1024, dim_embedding)
        nn.init.normal_(self.embed_lengths.weight, mean=0, std=0.02)
        
        self.dim_embedding = dim_embedding

    ''' CaptioningTransformerの順伝播処理
    features: 画像特徴量 [バッチサイズ, 埋め込み次元]
    captions: 正解キャプション [バッチサイズ, 系列長]
    '''
    def forward(self, images: torch.Tensor, captions: torch.Tensor, caption_lengths: torch.Tensor ):

        self.device = images.device

        masked_captions, mask = self.masking( captions, caption_lengths )
        
        memory = self.clip_model( images )
        memory = self.dense_connector( memory )
        memory = self.dropout( memory )
        memory = self.ln_memory( memory )

        memory = self.conv1( memory.transpose(1,2) ).transpose(1,2)
        
        emb_caption = self.emb( masked_captions ) * math.sqrt(self.dim_embedding)
        emb_caption += self.pos_emb( emb_caption )

        bert_in = torch.cat( [memory, emb_caption], dim = 1 )
        bert_in_padding_masks = (~(torch.eq( masked_captions, self.pad_token_id ))).float()
        bert_in_padding_masks = torch.cat( [torch.ones( memory.shape[:2], device=self.device ), bert_in_padding_masks], dim = 1 )
        
        outputs = self.bert( inputs_embeds = bert_in, attention_mask = bert_in_padding_masks ).last_hidden_state
        outputs = outputs[:,memory.size(1):,:]
        outputs = self.ln_outputs( outputs )
        logits = self.linear( outputs )

        predicted_lengths = self.lengths_predictor( memory )
        
        return logits, mask, predicted_lengths

    def dense_connector(self, memory ):
        tmp1 = torch.tensor([], device = self.device )
        tmp2 = torch.tensor([], device = self.device )
        tmp_full = len( memory.hidden_states )
        tmp_half = tmp_full // 2
        for i in range( 0, tmp_half ):
            tmp1 = torch.cat( [tmp1, memory.hidden_states[i][None]], dim = 0 )
        tmp1 = torch.sum(tmp1, dim=0) / tmp_half
        for i in range( tmp_half, tmp_full ):
            tmp2 = torch.cat( [tmp2, memory.hidden_states[i][None]], dim = 0 )
        tmp2 = torch.sum(tmp2, dim=0 ) / ( tmp_full - tmp_half )
        tmp3 = torch.cat([tmp1, tmp2], dim=-1)
        tmp3 = torch.cat( [ memory.last_hidden_state, tmp3], dim = -1 )
        #tmp3 = sel.dc_ln( tmp3 )
        tmp3 = self.dc_linear( tmp3 )
        return tmp3

    def masking(self, input_x: torch.Tensor, lengths: torch.Tensor) -> tuple[torch.Tensor]:

        output = input_x.clone()

        masks = torch.zeros_like( output, device=output.device, dtype=torch.bool )       
        
        #sum_num_mask = 0
        #sum_num_arbi = 0
        #sum_num_nochange = 0
        for n in range( output.size(0) ):
            all_prob = torch.rand( (1) )
            if all_prob > 0.99:
                num_mask = lengths[n]
                num_arbi = 0
                num_nochange = 0
            else:
                mask_prob0 = torch.rand( (1) )
                mask_prob = all_prob * mask_prob0
                resi_prob = all_prob * ( 1.0 - mask_prob0 )
                arbi_prob = all_prob * ( resi_prob * 0.5 )
                nochange_prob = all_prob * ( resi_prob * 0.5 )
                num_mask = math.floor( lengths[n].item() * mask_prob )
                num_arbi = math.floor( lengths[n].item() * arbi_prob )
                num_nochange = math.floor( lengths[n].item() * nochange_prob )

            #sum_num_mask += num_mask
            #sum_num_arbi += num_arbi
            #sum_num_nochange += num_nochange
            
            mask_mask = list( random.sample( list(range( 0, lengths[n])),  num_mask ))
            output[n,mask_mask] = self.mask_token_id
            not_mask_mask = [ n for n in range( lengths[n] ) if n not in mask_mask ]
            mask_arbi = random.sample( not_mask_mask, num_arbi )
            for i in range( lengths[n] ):
                if i in mask_arbi:
                    output[n,i] = torch.randint( 0, self.max_idx_en, size=(1,))
            not_mask_arbi = [ n for n in not_mask_mask if n not in mask_arbi ]
            mask_nochange = random.sample( not_mask_arbi, num_nochange )
            not_mask_nochange = [ n for n in not_mask_arbi if n not in mask_nochange ]
            mask = [ False if n in not_mask_nochange else True for n in range(lengths[n]) ]
            masks[n,:lengths[n]] = torch.tensor( mask )

        #print( "sum_num_mask:", sum_num_mask )
        #print( "calculate num mask:", torch.sum( torch.eq( output, self.mask_token_id ).int() ) )
        #print( "sum_num_mask + sum_num_arbi :", sum_num_mask + sum_num_arbi )
        #print( "num not equal:", torch.sum( torch.ne( input_x, output ).int() ) )
        #print( "sum_num_mask + sum_num_arbi + sum_nochange:", sum_num_mask + sum_num_arbi + sum_num_nochange )
        #print( "num of mask True:", torch.sum( torch.eq( masks, True ) ) )
        
        return output, masks
        
    def lengths_predictor(self, memory):
        
        x = self.ln_length(memory)
        x = self.conv_length( x )
        #print( "size of x[:,0,:]:",x[:,0,:].size())
        #print( "size of self.pos_emb.pos_emb.weight.tranpose(0,1):",self.pos_emb.pos_emb.weight.transpose(0,1).size())
        predicted_lengths_logits = torch.matmul( x[:,0,:], self.embed_lengths.weight.transpose(0,1)).float()
        predicted_lengths_logits [:,0] += float('-inf')
        predicted_lengths = F.log_softmax( predicted_lengths_logits, dim = -1 )
        #predicted_lengths は複数の候補が確率とともに
        
        return predicted_lengths

    def my_decode(self, token_list, tokenizer ):

        def my_index( l, x ):
            if x in l:
                return l.index(x)
            else:
                return -1
        if my_index( token_list, tokenizer.sep_token_id ) != -1:
            token_list = token_list[:my_index( token_list, tokenizer.sep_token_id )]
        else:
            token_list = token_list
            
        text = tokenizer.decode( token_list, skip_special_tokens = True )
        
        return text

In [4]:
class MyDataset(Dataset):
    def __init__(self, file_path: str, img_directory: str, transforms, tokenizer, length_max = None ) -> None:
        super().__init__()
        self.img_directory = img_directory
        self.transforms = transforms
        # TODO: fix to original data
        #画像の前処理
        self.img_file = []
        self.tokens = []
        self.lengths = []
        if length_max == None:
            self.length_max = 0
        else:
            self.length_max = length_max
        length_sum = 0
        with open( file_path, "r" ) as f:
            for i, line in enumerate( f ):
                if i % 100000 == 0:
                    print( "i:", i )
                self.img_file.append(line.split("\t" )[0])
                caption = line.split("\t")[1].replace( "\r\n", "" ).replace( "\n", "").replace( "\r", "" )
                id_tokens = tokenizer.encode( caption )
                length_sum += len( id_tokens )
                if length_max == None:
                    if self.length_max < len( id_tokens ):
                        self.length_max = len( id_tokens )
                    id_tokens = torch.tensor( id_tokens  )
                    self.lengths.append( len( id_tokens ) )
                else:
                    id_tokens = torch.tensor( id_tokens )[:length_max]
                    self.lengths.append( len( id_tokens ) )
                
                self.tokens.append( id_tokens )

                #line = f.readline()
        print("avg len:", length_sum / len( self.tokens ) )    
    
    # ここで取り出すデータを指定している
    def __getitem__(
        self,
        index: int
    ):
        tokens = self.tokens[index]
        img_file = self.img_file[index] + ".jpg"
        img_path = os.path.join( self.img_directory, img_file ) #index番目の画像のパスを取得
        img = Image.open(img_path) #PIL形式で画像を読み込み
        if img.mode != 'RGB':
            img = img.convert("RGB")
        img = self.transforms(img)
        lengths = self.lengths[index]
        
        return img, tokens, lengths

    # この method がないと DataLoader を呼び出す際にエラーを吐かれる
    def __len__(self) -> int:
        return len(self.tokens)

    def length_max(self):
        return self.length_max

In [5]:
def collate_func(batch: Sequence[Tuple[Union[torch.Tensor, str]]], pad_index ):
    imgs, tokens, lengths = zip(*batch)

    lengths = torch.tensor( lengths )
    
    max_length = torch.max( lengths )
    
    targets = []
    for target in tokens:
        pad_len = max_length - len( target ) 
        input2= F.pad( target, (0, pad_len), mode='constant', value = pad_index)
        targets.append( input2 )
    
    imgs = torch.stack( imgs, dim = 0 )
    targets = torch.stack( targets, dim = 0 )
    
    return imgs, targets, lengths

###学習におけるハイパーパラメータやオプションの設定

In [6]:
class ConfigTrain(object):
    '''
    ハイパーパラメータ、システム共通変数の設定
    '''
    def __init__(self):

        # ハイパーパラメータ
        self.img_size = 336
        self.length_max = 84
        self.dim_embedding = 1024   # 埋め込み層の次元
        self.lr_clip = 2e-7
        self.lr_bert = 2e-5            # 学習率
        self.lr_others = 1e-4
        self.weight_decay = 0.01
        self.dropout = 0.1         # dropout確率
        self.batch_size = 20       # ミニバッチ数
        self.num_epochs = 10       # エポック数→Colab無料版でテストする際は10未満に修正を推奨
        self.use_amp = True
        self.use_saved_pth = True
        self.use_saved_pth = False
        self.model_id = "google-bert/bert-large-uncased"
        self.warmup = 0.1
        self.alpha = 0.9
        self.betas = (0.9, 0.999)
        
        # パスの設定
        self.img_directory = '/mnt/ssd2/v7/img'
        self.anno_file = '../CLIP_LLM_AR/dataset.txt'
        self.save_directory = './model'

        # 検証に使う学習セット内のデータの割合
        self.test_ratio = 0.1
        self.val_ratio = 0.1
        
        # 学習に使うデバイス
        #self.device = 'cuda'
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        #self.device = 'cpu'
        
        # データローダーに使うCPUプロセスの数
        self.num_workers = 0 if self.device == torch.device('cpu') else 12
        #self.num_workers = 0
        
        # 移動平均で計算する損失の値の数
        self.moving_avg = 100

In [17]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")
model_id = "google-bert/bert-large-uncased"
tokenizer = BertTokenizer.from_pretrained(model_id)
model = CaptioningTransformer(img_size = 336, length_max = 84, dim_embedding=1024, vocab_size=len(tokenizer),
                 tokenizer=tokenizer, dropout=0.1, model_id =model_id).to(device)

images = torch.randn( ( 2, 3, 336,336 ), device = device )
captions = torch.randint( 0, len(tokenizer), size= (2, 50 ), device= device )
caption_lengths = torch.randint( 0, 50, size=(2,), device= device )
logits, mask, predicted_lengths = model( images, captions, caption_lengths )

print( logits.size() )
print( mask.size() )
print( predicted_lengths.size() )

img_length: 577
text_length_max: 84
stride: 6
torch.Size([2, 50, 30522])
torch.Size([2, 50])
torch.Size([2, 1024])


### 学習率スケジューラ

### 学習を行う関数

In [7]:
def calc_length_loss( predicted_lengths, length_target):
    #length_target = target.ne( tokenizer.pad_token_id).sum(-1).unsqueeze(-1)
    length_lprobs = predicted_lengths
    length_loss = -length_lprobs.gather( dim = -1, index=length_target[:,None])
    #length_loss = length_loss.sum()
    length_loss = length_loss.float().mean()
    return length_loss

In [8]:
config = ConfigTrain()

tokenizer = BertTokenizer.from_pretrained(config.model_id)

# 辞書サイズを保存
vocab_size = len( tokenizer )

# モデル出力用のディレクトリを作成
os.makedirs(config.save_directory, exist_ok=True)

# 画像のtransformsを定義
transforms = v2.Compose([
    v2.Resize((336, 336)),
    v2.AutoAugment(),
    #v2.ToTensor(),
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    ## Coco データセット 2017 train の平均と標準偏差
    #v2.Normalize((0.456,0.427,0.401),(0.224,0.219,0.231) )
    # ImageNetデータセットの平均と標準偏差
    #v2.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    # Clip Model の config から引用。
    v2.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
])

# v7 データセット
train_dataset = MyDataset( file_path=config.anno_file,
                           img_directory = config.img_directory,
                           transforms=transforms,tokenizer=tokenizer, length_max = config.length_max)

# Subset samplerの生成
test_set, val_set, train_set = util.generate_subset_test_val_train(
    train_dataset, config.test_ratio, config.val_ratio )
    
# 学習時にランダムにサンプルするためのサンプラー
train_sampler = SubsetRandomSampler(train_set)

# DataLoaderを生成
collate_func_lambda = lambda x: collate_func(x, tokenizer.pad_token_id)
train_loader = torch.utils.data.DataLoader(
                    train_dataset,
                    batch_size=config.batch_size,
                    num_workers=config.num_workers,
                    sampler=train_sampler,
                    collate_fn=collate_func_lambda)
val_loader = torch.utils.data.DataLoader(
                    train_dataset,
                    batch_size=config.batch_size,
                    num_workers=config.num_workers,
                    sampler=val_set,
                    collate_fn=collate_func_lambda)
test_loader = torch.utils.data.DataLoader(
                    train_dataset,
                    #batch_size=config.batch_size,
                    batch_size=1,
                    num_workers=config.num_workers,
                    sampler=test_set,
                    collate_fn=collate_func_lambda)

print( "config.device:", config.device )
print( "学習セット数:",len( train_loader ) )
print( "評価セット数:",len( val_loader ))
print( "テストセット数:",len( test_loader ))
print( "use_amp:", config.use_amp )
print( "use_saved_pth:", config.use_saved_pth )

# モデルの定義
model = CaptioningTransformer( config.img_size, config.length_max,
    config.dim_embedding, vocab_size,
    tokenizer, config.dropout, config.model_id)
model.to(config.device) 

PATH = "model/model_bert_mask_curr.pth"
print( "exist pth file:", os.path.isfile(PATH) )
use_saved_pth = config.use_saved_pth
if use_saved_pth and os.path.isfile(PATH):
    checkpoint = torch.load(PATH)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    ## optimizerのstateを現在のdeviceに移す。これをしないと、保存前後でdeviceの不整合が起こる可能性がある。
    #for state in optimizer.state.values():
        #for k, v in state.items():
            #if isinstance(v, torch.Tensor):
                #state[k] = v.to(device)
    begin_epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    global_step = checkpoint['global_step']    
else:
    begin_epoch = 0
    global_step = 0

print( "begin_epoch:", begin_epoch )
print( "global_ste:", global_step )

# 損失関数の定義
criterion = nn.CrossEntropyLoss( ignore_index = tokenizer.pad_token_id, reduction = 'mean' )

params_clip = []
params_bert = []
params_others = []
for name, parameter in model.named_parameters():
    if parameter.requires_grad:
        if 'clip_model' in name:
            params_clip.append(parameter)
        elif 'bert' in name:
            params_bert.append(parameter)
        else:
            params_others.append(parameter)
param_groups = [
    {'params': params_clip, 'lr': config.lr_clip},
    {'params': params_bert, 'lr': config.lr_bert},
    {'params': params_others, 'lr': config.lr_others}
]

# 最適化手法の定義
optimizer = torch.optim.AdamW( param_groups, weight_decay = config.weight_decay, betas= config.betas )

# 全ステップ数
print( "epochs:", config.num_epochs )
print( "batch_size:", config.batch_size )
num_global_steps = len( train_loader ) * config.num_epochs
print( "num_global_steps:", num_global_steps )
num_warmup_steps = num_global_steps * config.warmup
print( "num_warmup_steps:", num_warmup_steps )
#スケジューラーの定義
scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps, num_global_steps )    

len_tr_loader = len( train_loader )
train_param = len_tr_loader // 3
len_val_loader = len( val_loader )
#train_param = len_val_loader // 3
val_param = len_val_loader // 3
print( "train_param:", train_param )
print( "val_param:", val_param )

# 学習経過の書き込み
now = datetime.datetime.now()
train_loss_file = '{}/MyOriginal_train_loss_{}.csv'\
    .format(config.save_directory, now.strftime('%Y%m%d_%H%M%S'))
with open(train_loss_file, 'a') as f:
    print(f'{len_tr_loader}', file=f) 
print( "train_loss_file:", train_loss_file )
val_loss_file = '{}/MyOriginal_val_loss_{}.csv'\
    .format(config.save_directory, now.strftime('%Y%m%d_%H%M%S'))
with open(val_loss_file, 'a') as f:
    print(f'{len_val_loader}', file=f) 
norm_file = '{}/norm_{}.csv'\
    .format(config.save_directory, now.strftime('%Y%m%d_%H%M%S'))

print( "lr_clip  :", config.lr_clip)
print( "lr_bert  :", config.lr_bert )
print( "lr_others:", config.lr_others )
print( "weight_decay:", config.weight_decay )
print( "betas:", config.betas )
print( "alpha:", config.alpha )

# 学習
val_loss_best = float('inf')

fn = bleu_score.SmoothingFunction().method7

# AMP用のスケーラー
scaler = GradScaler(enabled=config.use_amp)

for epoch in range(config.num_epochs):
    with tqdm(train_loader) as pbar:
    #with tqdm(val_loader) as pbar:
        pbar.set_description(f'[エポック {epoch + 1}]')

        # 学習モードに設定
        model.train()

        train_losses1 = deque()
        train_losses2 = deque()
        train_losses = deque()
        train_errors = deque()
        train_bleus = deque()
        for n_batch, (imgs, captions, caption_lengths) in enumerate( pbar ):
            # ミニバッチを設定
            imgs = imgs.to(config.device)
            captions = captions.to(config.device)
            caption_lengths = caption_lengths.to(config.device)
                
            optimizer.zero_grad()

            # 最後の単語から次を予測する必要はないため最後の単語を除外
            with autocast(str(config.device),enabled=config.use_amp):
                outputs, mask, predicted_lengths = model( imgs, captions, caption_lengths )

                # 損失の計算
                # 単語軸が第1軸である必要があるため、転置
                loss1 = criterion(outputs[mask], captions[mask])
                loss2 = calc_length_loss( predicted_lengths, caption_lengths )
                loss = config.alpha * loss1 + ( 1 - config.alpha ) * loss2
            
            hypo_ids = torch.argmax( outputs, dim = 2 )
            
            scaler.scale(loss).backward()
            #scaler.unscale_(optimizer)
            #clip_grad_threshold = 5.0
            #torch.nn.utils.clip_grad_norm_(\
            #        model.parameters(),
            #        clip_grad_threshold)
            # オプティマイザにより，パラメータを更新する
            scaler.step(optimizer)
            scaler.update()            
            
            scheduler.step()

            #for name, param in model.named_parameters():
            #    print( name )
            
            norm0 = torch.sqrt( torch.norm( model.clip_model.vision_model.encoder.layers[0].self_attn.q_proj.weight.grad, p = 2 ) ).item()
            norm1 = torch.sqrt( torch.norm( model.bert.encoder.layer[23].attention.self.query.weight.grad, p = 2 ) ).item()
            norm_mean = torch.mean( torch.stack ([ torch.sqrt( torch.norm( param.grad, p = 2 ) ) \
                                                  for param in model.parameters() if param.grad is not None ] ) ).item()
            with open(norm_file, 'a') as f:
                print( "epcoch:", epoch, ", step:", global_step, ", norm0:", norm0, ", norm1:", norm1, ", norm_mean:", norm_mean, file=f  )
                f.flush()
            global_step += 1

            n = 0
            hypo_sentence = []
            ref_sentence = []
            hypo_sentence1 = []
            ref_sentence1 = []
            total_error = 0
            total_token_length = 0
            total_bleu = 0
            n2 = 0
            for (hypo_id, caption) in zip( hypo_ids, captions ):
                hypo = model.my_decode( hypo_id.tolist(), tokenizer )
                hypo_tokens = tokenizer.tokenize( hypo )
                reference = model.my_decode( caption.tolist(), tokenizer )
                ref_tokens = tokenizer.tokenize( reference )
                        
                # 認識誤りを計算
                (error, substitute, 
                    delete, insert, ref_length) = \
                    levenshtein.calculate_error(hypo_tokens,
                                                    ref_tokens)
                
                # 誤り文字数を累積する
                total_error += error
                # 文字の総数を累積する
                total_token_length += ref_length

                bleu = bleu_score.sentence_bleu( [reference], hypo, smoothing_function=fn  )
        
                total_bleu += bleu                    
                    
                if n < 1 and n_batch == len( train_loader ) - 1 :
                    hypo_sentence.append( hypo )
                    ref_sentence.append( reference )
                if n < 1 and n_batch % train_param == 0:
                    hypo_sentence1.append( hypo )
                    ref_sentence1.append( reference )
                    
                n += 1
                n2 += 1
            
            avg_error = total_error / total_token_length * 100
            avg_bleu = total_bleu / n2 * 100
                
            # 学習時の損失をログに書き込み
            train_losses1.append(loss1.item())
            train_losses2.append(loss2.item())
            train_losses.append(loss.item())
            train_errors.append( avg_error )
            train_bleus.append( avg_bleu )
            #train_ciders.append( avg_cider )
            if len(train_losses) > config.moving_avg:
                train_losses1.popleft()
                train_losses2.popleft()
                train_losses.popleft()
                train_errors.popleft()
                train_bleus.popleft()
                #train_ciders.popleft()
            mean_loss1 = torch.Tensor(train_losses1).mean().item()
            mean_loss2 = torch.Tensor(train_losses2).mean().item()
            mean_loss = torch.Tensor(train_losses).mean().item()
            mean_error = torch.Tensor(train_errors).mean().item()
            mean_bleu = torch.Tensor(train_bleus).mean().item()
            pbar.set_postfix({
                'loss1': mean_loss1,
                'loss2': mean_loss2,
                'loss': mean_loss,
                'WER': mean_error,
                'BLEU': mean_bleu,
                #'CIDER': torch.Tensor(train_ciders).mean().item()
            })
            with open(train_loss_file, 'a') as f:
                print(f'{epoch}, {mean_loss1}, {mean_loss2}, {mean_loss}, {mean_error}, {mean_bleu}', file=f)
            print_flag = 1
            for ( hypo_se, ref_se ) in zip( hypo_sentence1, ref_sentence1 ):
                if print_flag == 1:
                    print( "lr clip  :", optimizer.param_groups[0]["lr"] )
                    print( "lr bert  :", optimizer.param_groups[1]["lr"] )
                    print( "lr others:", optimizer.param_groups[2]["lr"] )
                    print_flag = 0
                #print(f'Train epoch = {epoch}, loss = {loss.item()}, WER = {avg_error}, BLEU = {avg_bleu}, CIDER = {avg_cider}')
                print(f'Train epoch = {epoch}, loss1 = {mean_loss1}, loss2 = {mean_loss2}, loss = {mean_loss}, WER = {mean_error}, BLEU = {mean_bleu}')
                print( "refe:", ref_se )
                print( "hypo:", hypo_se )
                    
            for ( hypo_se, ref_se ) in zip( hypo_sentence, ref_sentence ):
                print(f'Train epoch = {epoch}, loss1 = {mean_loss1}, loss2 = {mean_loss2}, loss = {mean_loss}, WER = {mean_error}, BLEU = {mean_bleu}')
                #print(f'Train epoch = {epoch}, loss = {loss.item()}, WER = {avg_error}, BLEU = {avg_bleu}, CIDER = {avg_cider}')
                print( "refe:", ref_se )
                print( "hypo:", hypo_se )
    # 学習率を表示
    print(f'学習率 clip  : {optimizer.param_groups[0]['lr']}')
    print(f'学習率 bert  : {optimizer.param_groups[1]['lr']}')
    print(f'学習率 others: {optimizer.param_groups[2]['lr']}')
    train_loss1 = np.mean(train_losses1)
    train_loss2 = np.mean(train_losses2)
    train_loss = np.mean(train_losses)
    train_error = np.mean(train_errors )
    train_bleu = np.mean(train_bleus )
    print(f'Train loss1: {train_loss1}')
    print(f'Train loss2: {train_loss2}')
    print(f'Train loss: {train_loss}')
    print(f'Train WER: {train_error}')        
    print(f'Train BLEU: {train_bleu}')

    # 検証
    with tqdm(val_loader) as pbar:
        pbar.set_description(f'[検証]')

        # 評価モード
        model.eval()

        #val_losses = []
        val_losses1 = deque()
        val_losses2 = deque()
        val_losses = deque()
        val_errors = deque()
        val_bleus = deque()
        for n_batch, (imgs, captions, caption_lengths) in enumerate( pbar ):

            # ミニバッチを設定
            imgs = imgs.to(config.device)
            captions = captions.to(config.device)
            caption_lengths = caption_lengths.to(config.device)
                
            with torch.no_grad():
                outputs, mask, predicted_lengths = model( imgs, captions, caption_lengths )
                hypo_ids = torch.argmax( outputs, dim = 2 )
                loss1 = criterion(outputs[mask], captions[mask])
                loss2 = calc_length_loss( predicted_lengths, caption_lengths )
                loss = config.alpha * loss1 + ( 1 - config.alpha ) * loss2
            
            n = 0
            hypo_sentence = []
            ref_sentence = []
            hypo_sentence1 = []
            ref_sentence1 = []
            total_error = 0
            total_token_length = 0
            total_bleu = 0
            n2 = 0
            for (hypo_id, caption) in zip( hypo_ids, captions ):
                hypo = model.my_decode( hypo_id.tolist(), tokenizer )
                hypo_tokens = tokenizer.tokenize( hypo )
                reference = model.my_decode( caption.tolist(), tokenizer )
                ref_tokens = tokenizer.tokenize( reference )
                        
                # 認識誤りを計算
                (error, substitute, 
                    delete, insert, ref_length) = \
                    levenshtein.calculate_error(hypo_tokens,
                                                ref_tokens)
                    
                # 誤り文字数を累積する
                total_error += error
                # 文字の総数を累積する
                total_token_length += ref_length

                bleu = bleu_score.sentence_bleu( [reference], hypo, smoothing_function=fn  )
        
                total_bleu += bleu

                if n < 1 and n_batch == len( val_loader ) - 1:
                    hypo_sentence.append( hypo )
                    ref_sentence.append( reference )
                        
                if n < 1 and n_batch % val_param == 0:
                    hypo_sentence1.append( hypo )
                    ref_sentence1.append( reference )
                    
                n += 1
                n2 += 1
                
            avg_error = total_error / total_token_length * 100                    
            avg_bleu = total_bleu / n2 * 100

            # 学習時の損失をログに書き込み
            val_losses1.append(loss1.item())
            val_losses2.append(loss2.item())
            val_losses.append(loss.item())
            val_errors.append( avg_error )
            val_bleus.append( avg_bleu )
            if len(val_losses) > config.moving_avg:
                val_losses1.popleft()
                val_losses2.popleft()
                val_losses.popleft()
                val_errors.popleft()
                val_bleus.popleft()
            mean_loss1 = torch.Tensor(val_losses1).mean().item()
            mean_loss2 = torch.Tensor(val_losses2).mean().item()
            mean_loss = torch.Tensor(val_losses).mean().item()
            mean_error = torch.Tensor(val_errors).mean().item()
            mean_bleu = torch.Tensor(val_bleus).mean().item()
            pbar.set_postfix({
                'loss1': mean_loss1,
                'loss2': mean_loss2,
                'loss': mean_loss,
                'WER': mean_error,
                'BLEU': mean_bleu,
            })
            # Validation Lossをログに書き込み
            with open(val_loss_file, 'a') as f:
                print(f'{epoch}, {mean_loss1}, {mean_loss2}, {mean_loss}, {mean_error}, {mean_bleu}', file=f)

            for ( hypo_se, ref_se ) in zip( hypo_sentence1, ref_sentence1 ):
                print(f'Val epoch = {epoch}, loss1 = {mean_loss1}, loss2 = {mean_loss2}, loss = {mean_loss}, WER = {mean_error}, BLEU = {mean_bleu}')
                print( "refe:", ref_se )
                print( "hypo:", hypo_se )
                    
            for ( hypo_se, ref_se ) in zip( hypo_sentence, ref_sentence ):
                print(f'Val epoch = {epoch}, loss1 = {mean_loss1}, loss2 = {mean_loss2}, loss = {mean_loss}, WER = {mean_error}, BLEU = {mean_bleu}')
                print( "refe:", ref_se )
                print( "hypo:", hypo_se )
                    
    # Loss 表示
    val_loss1 = np.mean(val_losses1)
    val_loss2 = np.mean(val_losses2)
    val_loss = np.mean(val_losses)
    val_error = np.mean( val_errors )
    val_bleu = np.mean( val_bleus )
    print(f'Validation loss1: {val_loss1}')
    print(f'Validation loss2: {val_loss2}')
    print(f'Validation loss: {val_loss}')
    print(f'Validation WER: {val_error}')
    print(f'Validation BLEU: {val_bleu}')

    # より良い検証結果が得られた場合、モデルを保存
    if val_loss < val_loss_best:
        val_loss_best = val_loss

        # モデルを保存
        torch.save({'epoch': epoch,
                    'global_step': global_step,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'loss': loss,},
            f'{config.save_directory}/model_bert_mask_best.pth')
            
    # モデルを保存
    torch.save({'epoch': epoch,
                'global_step': global_step,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'loss': loss,},
        f'{config.save_directory}/model_bert_mask_curr.pth')
        
# モデルを保存
torch.save({'epoch': epoch,
    'global_step': global_step,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'scheduler_state_dict': scheduler.state_dict(),
    'loss': loss,},
    f'{config.save_directory}/model_bert_mask_final.pth')
#f_norm.close()  



i: 0
i: 100000
i: 200000
i: 300000
i: 400000
i: 500000
avg len: 42.0877771734418
config.device: cuda:0
学習セット数: 20298
評価セット数: 2538
テストセット数: 50744
use_amp: True
use_saved_pth: False
img_length: 577
text_length_max: 84
stride: 6
exist pth file: True
begin_epoch: 0
global_ste: 0
epochs: 10
batch_size: 20
num_global_steps: 202980
num_warmup_steps: 20298.0
train_param: 6766
val_param: 846
train_loss_file: ./model/MyOriginal_train_loss_20250912_071056.csv
lr_clip  : 2e-07
lr_bert  : 2e-05
lr_others: 0.0001
weight_decay: 0.01
betas: (0.9, 0.999)
alpha: 0.9


  0%|          | 0/20298 [00:00<?, ?it/s]

2025-09-12 07:11:01.805717: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-12 07:11:01.847592: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757628661.867580  551107 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757628661.873421  551107 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1757628661.896992  551107 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

lr clip  : 9.853187506158242e-12
lr bert  : 9.853187506158243e-10
lr others: 4.9265937530791215e-09
Train epoch = 0, loss1 = 10.493376731872559, loss2 = 6.682277202606201, loss = 10.112266540527344, WER = 215.59202575683594, BLEU = 15.343134880065918
refe: there are two girl, girls in the picture, holding a microphone in their hands. both are smiling.
hypo: proportions [unused199] legislatures pouch casketscoe quantity pouch explainse quantity suicide メ quantity free lash suicide usa 1683 proportions comb falcon pouch boer ⁱ [unused375] suicide versatile vendor fascist lesbian 999ya marx pouch rein pouch [unused771] pouch average divinity traveller pouchbon freekei resumed promptly smirk pouch premiered average suicide internment permits waters ortiz slot casketbon pouch suicide suicide devin devin explainsisches suebon threat boer pouch deeply countriessity bothered pouch 1927 pouch pouch pouch average pouchuka
lr clip  : 6.667651985417282e-08
lr bert  : 6.667651985417283e-06
lr other

  0%|          | 0/2538 [00:00<?, ?it/s]

Val epoch = 0, loss1 = 2.996779203414917, loss2 = 3.88214111328125, loss = 3.085315227508545, WER = 34.0, BLEU = 85.4687271118164
refe: in this image i can see few trees which are green in color, few flowers which are red in color and in the background i can see a person standing, the road, few vehicles, few buildings, few trees and the sky.
hypo: in this image i can see in trees few are green in in in in in in in in in in in in the in i can in in in standing, the road in few vehicles in few in, in in and in sky in in in in in in in in in in in in in in in in in in in
Val epoch = 0, loss1 = 3.0988852977752686, loss2 = 3.921879291534424, loss = 3.181185007095337, WER = 41.089805603027344, BLEU = 76.60462188720703
refe: in this picture we can see planets, where we can see few people and some objects.
hypo: in this picture we can see flowers, where we can see few people and some objects.
Val epoch = 0, loss1 = 3.062443733215332, loss2 = 3.8770389556884766, loss = 3.1439034938812256, WER =

  0%|          | 0/20298 [00:00<?, ?it/s]

lr clip  : 1.9999890520138821e-07
lr bert  : 1.9999890520138824e-05
lr others: 9.999945260069411e-05
Train epoch = 1, loss1 = 2.7560439109802246, loss2 = 3.82975172996521, loss = 2.8634145259857178, WER = 35.45918273925781, BLEU = 73.46664428710938
refe: in this image we can see two persons standing near the building, a person is wearing uniform and holding a gun and there are leaves on the floor, there is a railing and trees beside the building.
hypo: in this image we can see two persons standing near the building, a person is wearing uniform and holding a in and there are leaves on the floor, there is a railing and trees beside the building.
lr clip  : 1.925914977939808e-07
lr bert  : 1.9259149779398082e-05
lr others: 9.62957488969904e-05
Train epoch = 1, loss1 = 2.919363021850586, loss2 = 3.885070562362671, loss = 3.0159335136413574, WER = 32.449703216552734, BLEU = 78.77717590332031
refe: in this image i can see the ground, a plant which is green in color, a car which is white in c

  0%|          | 0/2538 [00:00<?, ?it/s]

Val epoch = 1, loss1 = 3.0301871299743652, loss2 = 3.896893262863159, loss = 3.1168575286865234, WER = 17.375, BLEU = 94.4908447265625
refe: in this image i can see few trees which are green in color, few flowers which are red in color and in the background i can see a person standing, the road, few vehicles, few buildings, few trees and the sky.
hypo: in this image i can see few trees which are green in color, few flowers which are red in color and in the background i can see a person standing, the road, few sky, few buildings, sky trees and the sky.
Val epoch = 1, loss1 = 2.685990571975708, loss2 = 3.888770341873169, loss = 2.8062682151794434, WER = 27.796985626220703, BLEU = 84.00509643554688
refe: in this picture we can see planets, where we can see few people and some objects.
hypo: in this picture we can see trees few where we can see few people and some objects.
Val epoch = 1, loss1 = 2.671024799346924, loss2 = 3.844860792160034, loss = 2.7884082794189453, WER = 26.7859344482421

  0%|          | 0/20298 [00:00<?, ?it/s]

lr clip  : 1.7777668297916596e-07
lr bert  : 1.77776682979166e-05
lr others: 8.8888341489583e-05
Train epoch = 2, loss1 = 2.8044729232788086, loss2 = 4.008514404296875, loss = 2.924877166748047, WER = 32.8021240234375, BLEU = 83.9969482421875
refe: in this image we can see a monument. we can also see some trees, wall and the sky.
hypo: in this image we can see a monument. we can also see some trees, can and the sky can
lr clip  : 1.7036927557175857e-07
lr bert  : 1.703692755717586e-05
lr others: 8.51846377858793e-05
Train epoch = 2, loss1 = 2.587308406829834, loss2 = 3.872014045715332, loss = 2.7157790660858154, WER = 27.238988876342773, BLEU = 84.21029663085938
refe: in this image we can see a person. the background of the image is blurred. to the right side of the image there is another person.
hypo: in this image we can see a person. the background of the image is blurred. to the right side of the image there is the person.
lr clip  : 1.6296186816435115e-07
lr bert  : 1.629618681643

  0%|          | 0/2538 [00:00<?, ?it/s]

Val epoch = 2, loss1 = 2.3117377758026123, loss2 = 3.830548048019409, loss = 2.463618755340576, WER = 28.375, BLEU = 83.69453430175781
refe: in this image i can see few trees which are green in color, few flowers which are red in color and in the background i can see a person standing, the road, few vehicles, few buildings, few trees and the sky.
hypo: in this image i can see few trees which are green in color, few flowers which are red in color and in the background i can see a person standing, the road, few vehicles, few buildings, few trees and the sky.
Val epoch = 2, loss1 = 2.3777832984924316, loss2 = 3.8754239082336426, loss = 2.5275473594665527, WER = 23.898252487182617, BLEU = 88.36492919921875
refe: in this picture we can see planets, where we can see few people and some objects.
hypo: in this picture we can see plants, where we can see few people and we objects.
Val epoch = 2, loss1 = 2.3690338134765625, loss2 = 3.8282482624053955, loss = 2.5149550437927246, WER = 24.83917045

  0%|          | 0/20298 [00:00<?, ?it/s]

lr clip  : 1.5555446075694376e-07
lr bert  : 1.5555446075694377e-05
lr others: 7.777723037847188e-05
Train epoch = 3, loss1 = 2.3504271507263184, loss2 = 3.7467422485351562, loss = 2.490058660507202, WER = 17.780580520629883, BLEU = 94.62484741210938
refe: in the picture i can see screws are fixed to some surface, here i can see a box on which i can see some text. they are placed on the white color surface.
hypo: in the picture i can see is are i to some surface, here i can see a box on which i can see some text. i are placed on the white color surface.
lr clip  : 1.4814705334953634e-07
lr bert  : 1.4814705334953637e-05
lr others: 7.407352667476818e-05
Train epoch = 3, loss1 = 2.328052282333374, loss2 = 3.8555941581726074, loss = 2.480806350708008, WER = 24.8011531829834, BLEU = 88.0846939086914
refe: on the left side, there are three children and a woman sitting. on the right side, there is a girl holding a towel, there is grass and a pipe. in the background, there are plants, a wall 

  0%|          | 0/2538 [00:00<?, ?it/s]

Val epoch = 3, loss1 = 2.058337688446045, loss2 = 3.7990500926971436, loss = 2.2324090003967285, WER = 22.125, BLEU = 86.36640930175781
refe: in this image i can see few trees which are green in color, few flowers which are red in color and in the background i can see a person standing, the road, few vehicles, few buildings, few trees and the sky.
hypo: in this image i can see few trees which are green in color, which flowers which are which in color color in the background i can see few,,,,,, few vehicles, few buildings and few and the the sky.
Val epoch = 3, loss1 = 2.0392348766326904, loss2 = 3.8754994869232178, loss = 2.2228610515594482, WER = 21.619733810424805, BLEU = 91.38951110839844
refe: in this picture we can see planets, where we can see few people and some objects.
hypo: in this picture we can see plants, where we can see few people people people camera
Val epoch = 3, loss1 = 2.051619052886963, loss2 = 3.8333709239959717, loss = 2.2297940254211426, WER = 20.834482192993164

  0%|          | 0/20298 [00:00<?, ?it/s]

lr clip  : 1.3333223853472153e-07
lr bert  : 1.3333223853472155e-05
lr others: 6.666611926736077e-05
Train epoch = 4, loss1 = 2.2062149047851562, loss2 = 3.868011474609375, loss = 2.372394561767578, WER = 17.129629135131836, BLEU = 95.42546844482422
refe: at the bottom of this image, there are buildings having windows, a tower and trees. in the background, there is blue sky.
hypo: at the bottom of this image, there are buildings having windows, a tower and trees. in the background, there is blue sky.
lr clip  : 1.2592483112731414e-07
lr bert  : 1.2592483112731415e-05
lr others: 6.296241556365708e-05
Train epoch = 4, loss1 = 2.1491312980651855, loss2 = 3.8783621788024902, loss = 2.322054386138916, WER = 22.641279220581055, BLEU = 90.88306427001953
refe: in this image we can see trees with flowers and at the bottom there is an object. in the background we can see clouds in the sky.
hypo: in this image we can see trees with flowers and at the bottom there is an object. in the background w

  0%|          | 0/2538 [00:00<?, ?it/s]

Val epoch = 4, loss1 = 1.8509347438812256, loss2 = 3.8439412117004395, loss = 2.0502352714538574, WER = 20.875, BLEU = 92.80429077148438
refe: in this image i can see few trees which are green in color, few flowers which are red in color and in the background i can see a person standing, the road, few vehicles, few buildings, few trees and the sky.
hypo: in this image i can see few plants which are green green color color the which which are green green green in in the background i can see a person standing a, the, few vehicles the few few few few the and the sky.
Val epoch = 4, loss1 = 1.7838599681854248, loss2 = 3.8732187747955322, loss = 1.992795705795288, WER = 18.457826614379883, BLEU = 94.2724380493164
refe: in this picture we can see planets, where we can see few people and some objects.
hypo: in this picture we can see planets, where we can see few people and some objects.
Val epoch = 4, loss1 = 1.8204185962677002, loss2 = 3.837341070175171, loss = 2.0221107006073, WER = 19.001

  0%|          | 0/20298 [00:00<?, ?it/s]

lr clip  : 1.1111001631249932e-07
lr bert  : 1.1111001631249934e-05
lr others: 5.555500815624967e-05
Train epoch = 5, loss1 = 1.9714245796203613, loss2 = 3.6969246864318848, loss = 2.143974542617798, WER = 24.120603561401367, BLEU = 93.62443542480469
refe: it is a tilted image, there is a club and it is closed there are some other buildings around the club and beside the the wall of the club a car is parked and in the front there is a lot of grass and some flower plants, on the right side there is a tree.
hypo: this is a edited image there there is a building and on the building there are some and on and the. building beside the the there there building there a there is there and there there there there is a there the grass the the flower plants the the the the side the is is tree
lr clip  : 1.037026089050919e-07
lr bert  : 1.0370260890509192e-05
lr others: 5.185130445254596e-05
Train epoch = 5, loss1 = 1.9083539247512817, loss2 = 3.8496744632720947, loss = 2.1024858951568604, WER = 19

  0%|          | 0/2538 [00:00<?, ?it/s]

Val epoch = 5, loss1 = 1.3582632541656494, loss2 = 3.915787935256958, loss = 1.6140156984329224, WER = 16.0, BLEU = 97.04225158691406
refe: in this image i can see few trees which are green in color, few flowers which are red in color and in the background i can see a person standing, the road, few vehicles, few buildings, few trees and the sky.
hypo: in this image i can see few plants which are green color color, few flowers in in in color color and in the background i can see a few standing, few few, few vehicles, few buildings, few trees and the sky.
Val epoch = 5, loss1 = 1.5917391777038574, loss2 = 3.8742291927337646, loss = 1.8199881315231323, WER = 16.461193084716797, BLEU = 96.37561798095703
refe: in this picture we can see planets, where we can see few people and some objects.
hypo: in this picture we can see planets, where we can see few people and some objects.
Val epoch = 5, loss1 = 1.6000300645828247, loss2 = 3.832028388977051, loss = 1.8232300281524658, WER = 16.441928863

  0%|          | 0/20298 [00:00<?, ?it/s]

lr clip  : 8.888779409027709e-08
lr bert  : 8.88877940902771e-06
lr others: 4.444389704513855e-05
Train epoch = 6, loss1 = 1.6748381853103638, loss2 = 3.7402608394622803, loss = 1.8813804388046265, WER = 14.490674018859863, BLEU = 95.23291778564453
refe: in the given image at the bottom i can see some cars, trees, buildings, pole and in the top i can see a black and white picture.
hypo: in the given image at the bottom i can see some cars, trees, buildings, pole and in the top i can see a black and white picture.
lr clip  : 8.148038668286969e-08
lr bert  : 8.148038668286969e-06
lr others: 4.074019334143485e-05
Train epoch = 6, loss1 = 1.7667077779769897, loss2 = 3.8223254680633545, loss = 1.9722694158554077, WER = 18.148714065551758, BLEU = 95.22734069824219
refe: in the picture there are two empty dining tables, on the tables there are flowers vases and candles, in the background there are two windows in between a brick wall.
hypo: in this image there are chairs around around tables, 

  0%|          | 0/2538 [00:00<?, ?it/s]

Val epoch = 6, loss1 = 1.661980390548706, loss2 = 3.87724232673645, loss = 1.8835065364837646, WER = 13.0, BLEU = 101.15412902832031
refe: in this image i can see few trees which are green in color, few flowers which are red in color and in the background i can see a person standing, the road, few vehicles, few buildings, few trees and the sky.
hypo: in this image i can see few trees which are green in color, few flowers which are green in color and in the background i can see a person standing, the road, few vehicles, few buildings, few trees and the sky.
Val epoch = 6, loss1 = 1.4860997200012207, loss2 = 3.868082046508789, loss = 1.724298119544983, WER = 15.082554817199707, BLEU = 97.90394592285156
refe: in this picture we can see planets, where we can see few people and some objects.
hypo: in this picture we can see planets, where we can see some people and some objects.
Val epoch = 6, loss1 = 1.4873486757278442, loss2 = 3.8250012397766113, loss = 1.721113681793213, WER = 15.3081045

  0%|          | 0/20298 [00:00<?, ?it/s]

lr clip  : 6.666557186805486e-08
lr bert  : 6.666557186805488e-06
lr others: 3.333278593402744e-05
Train epoch = 7, loss1 = 1.1979515552520752, loss2 = 3.6153886318206787, loss = 1.4396952390670776, WER = 9.516837120056152, BLEU = 102.95063781738281
refe: in this picture i can see many birds on the pole and wire. at the bottom i can see roof of the building. at the top i can see the sky.
hypo: in this picture i can see many birds on the pole and wire. at the bottom i can see roof of the the. at the top i can see the sky.
lr clip  : 5.9258164460647466e-08
lr bert  : 5.925816446064747e-06
lr others: 2.9629082230323733e-05
Train epoch = 7, loss1 = 1.6078873872756958, loss2 = 3.8601551055908203, loss = 1.8331141471862793, WER = 17.090187072753906, BLEU = 96.41214752197266
refe: in this image there are two persons in the center the girl is sitting on the chair and is taking her tongue out of the mouth. in the front there is a juice bottle with a name juicy juice at the right side. on the le

  0%|          | 0/2538 [00:00<?, ?it/s]

Val epoch = 7, loss1 = 1.1166132688522339, loss2 = 3.8850319385528564, loss = 1.393455147743225, WER = 11.25, BLEU = 100.99256896972656
refe: in this image i can see few trees which are green in color, few flowers which are red in color and in the background i can see a person standing, the road, few vehicles, few buildings, few trees and the sky.
hypo: in this image i can see few plants which are green in color and few flowers which are in in color and in the background i can see a person standing the few road, few vehicles, few buildings, few trees and the sky.
Val epoch = 7, loss1 = 1.3916603326797485, loss2 = 3.871037483215332, loss = 1.6395981311798096, WER = 14.809781074523926, BLEU = 97.94807434082031
refe: in this picture we can see planets, where we can see few people and some objects.
hypo: in this picture we can see a people and we can see few people and a objects.
Val epoch = 7, loss1 = 1.389971137046814, loss2 = 3.825019598007202, loss = 1.63347589969635, WER = 13.81796264

  0%|          | 0/20298 [00:00<?, ?it/s]

lr clip  : 4.4443349645832644e-08
lr bert  : 4.444334964583265e-06
lr others: 2.2221674822916323e-05
Train epoch = 8, loss1 = 1.3958228826522827, loss2 = 3.5955910682678223, loss = 1.6157996654510498, WER = 11.587301254272461, BLEU = 98.34548950195312
refe: in this image in the center there are two persons who are standing and one person is holding a phone and talking, in the background there is a grass and one object. on the right side of the image there is some text written.
hypo: in this image in the center there are two persons who are standing and one person is holding a phone and talking, in the background there is a trees and one object. on the right side of the image there some is text written.
lr clip  : 3.703594223842524e-08
lr bert  : 3.7035942238425245e-06
lr others: 1.8517971119212622e-05
Train epoch = 8, loss1 = 1.5323857069015503, loss2 = 3.8438777923583984, loss = 1.7635351419448853, WER = 16.39889144897461, BLEU = 96.65179443359375
refe: this is image is clicked inside

  0%|          | 0/2538 [00:00<?, ?it/s]

Val epoch = 8, loss1 = 1.2302350997924805, loss2 = 3.9072036743164062, loss = 1.497931957244873, WER = 13.375, BLEU = 98.37784576416016
refe: in this image i can see few trees which are green in color, few flowers which are red in color and in the background i can see a person standing, the road, few vehicles, few buildings, few trees and the sky.
hypo: in this image i can see few trees which are green in color and few flowers which are red in color and in the color i can see a person standing, the road few few vehicles, few buildings, few trees and the..
Val epoch = 8, loss1 = 1.3124650716781616, loss2 = 3.874128818511963, loss = 1.5686312913894653, WER = 13.351112365722656, BLEU = 99.21080017089844
refe: in this picture we can see planets, where we can see few people and some objects.
hypo: in this picture we can see planets, where we can see few people and some objects.
Val epoch = 8, loss1 = 1.3061429262161255, loss2 = 3.82527232170105, loss = 1.5580557584762573, WER = 13.503962516

  0%|          | 0/20298 [00:00<?, ?it/s]

lr clip  : 2.2221127423610426e-08
lr bert  : 2.222112742361043e-06
lr others: 1.1110563711805213e-05
Train epoch = 9, loss1 = 1.2345689535140991, loss2 = 3.867771863937378, loss = 1.4978891611099243, WER = 10.823529243469238, BLEU = 102.77295684814453
refe: in this image, we can see a man, he is wearing a jacket, the background is not clear.
hypo: in this image, we can see a man, he is wearing a jacket, the background is not clear.
lr clip  : 1.4813720016203019e-08
lr bert  : 1.481372001620302e-06
lr others: 7.40686000810151e-06
Train epoch = 9, loss1 = 1.4247779846191406, loss2 = 3.8220207691192627, loss = 1.6645022630691528, WER = 15.244206428527832, BLEU = 97.60043334960938
refe: the picture is taken on the street of a city. in the center of the picture there are shops, tents, umbrellas, auto rickshaw, motor bike, people and many other objects. in the background there are buildings. in the foreground there are waste papers on the road.
hypo: the picture is taken on the streets of a 

  0%|          | 0/2538 [00:00<?, ?it/s]

Val epoch = 9, loss1 = 1.3455162048339844, loss2 = 3.873495578765869, loss = 1.5983141660690308, WER = 14.875, BLEU = 99.36988067626953
refe: in this image i can see few trees which are green in color, few flowers which are red in color and in the background i can see a person standing, the road, few vehicles, few buildings, few trees and the sky.
hypo: in this image i can see few trees which are green in color color few flowers which are red in color. in the background i can see few persons standing, few, few few vehicles, few buildings and the is the the
Val epoch = 9, loss1 = 1.3162145614624023, loss2 = 3.8691415786743164, loss = 1.571507215499878, WER = 13.602813720703125, BLEU = 98.98859405517578
refe: in this picture we can see planets, where we can see few people and some objects.
hypo: in this picture we can see plants plants here we can see some people and some objects.
Val epoch = 9, loss1 = 1.2897738218307495, loss2 = 3.825502634048462, loss = 1.5433465242385864, WER = 13.24