In [1]:
# -*- coding: utf-8 -*-

#
# Pytorchで用いるDatasetの定義
#

#!pip install janome

# sysモジュールをインポート
import sys

import matplotlib.pyplot as plt
import pandas as pd
import torch
import random
from torch import nn, Tensor
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import numpy as np
import math
import janome
from janome.tokenizer import Tokenizer
from collections import Counter
from torch.utils.data.sampler import SubsetRandomSampler
import time
import levenshtein
import pickle
from timm.scheduler import CosineLRScheduler
from nltk import bleu_score
#from tqdm import tqdm
from tqdm.notebook import tqdm
from torch import autocast, GradScaler
from typing import Sequence, Dict, Tuple, Union
from transformers import  get_linear_schedule_with_warmup
from transformers import  CLIPVisionModel, get_linear_schedule_with_warmup, BertTokenizer

#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device( "cpu")
use_amp = False

In [2]:
path = "../../python_image_recognition-main/6_img_captioning/6_7_ImgCap_pre_trained_Feature_extractor/CLIP_ENCODER/models--google-bert--bert-large-uncased/snapshots/6da4b6a26a1877e173fca3225479512db81a5e5b"
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path = path)

vocab_size = len(tokenizer)
print(vocab_size)

30522


In [3]:
'''
BERT tokenizers utilize specific special tokens for various purposes, primarily to structure input sequences for the BERT model. These tokens are typically added automatically when processing text with a BERT tokenizer, especially when using methods like encode_plus or __call__ from the Hugging Face Transformers library.
The most common special tokens in a BERT tokenizer are:
[CLS] (Classifier Token):
This token is placed at the beginning of the input sequence. Its corresponding final hidden state is used as the aggregate representation of the entire sequence for classification tasks.
[SEP] (Separator Token):
This token is used to separate different segments within an input sequence, such as separating a question from an answer in question-answering tasks, or to mark the end of a single sentence.
[PAD] (Padding Token):
This token is used to pad sequences to a uniform length, ensuring that all sequences in a batch have the same dimensions for efficient processing by the model.
[UNK] (Unknown Token):
This token represents words or subwords that are not found in the tokenizer's vocabulary.
[MASK] (Mask Token):
This token is used during the pre-training phase of BERT for the Masked Language Model (MLM) objective, where a percentage of input tokens are randomly replaced with [MASK], and the model predicts the original token.
When you tokenize text and request the return of special tokens, these tokens will be included in the output token IDs and can be viewed by decoding the token IDs back to strings. For example, if you tokenize a sentence like "Hello world," the output might look something like ['[CLS]', 'Hello', 'world', '[SEP]'] when decoded.
'''

'\nBERT tokenizers utilize specific special tokens for various purposes, primarily to structure input sequences for the BERT model. These tokens are typically added automatically when processing text with a BERT tokenizer, especially when using methods like encode_plus or __call__ from the Hugging Face Transformers library.\nThe most common special tokens in a BERT tokenizer are:\n[CLS] (Classifier Token):\nThis token is placed at the beginning of the input sequence. Its corresponding final hidden state is used as the aggregate representation of the entire sequence for classification tasks.\n[SEP] (Separator Token):\nThis token is used to separate different segments within an input sequence, such as separating a question from an answer in question-answering tasks, or to mark the end of a single sentence.\n[PAD] (Padding Token):\nThis token is used to pad sequences to a uniform length, ensuring that all sequences in a batch have the same dimensions for efficient processing by the model.

In [3]:
class SequenceDataset(Dataset):
    ''' ミニバッチデータを作成するクラス
        torch.utils.data.Datasetクラスを継承し，
        以下の関数を定義する
        __len__: 総サンプル数を出力する関数
        __getitem__: 1サンプルのデータを出力する関数
    feat_scp:  特徴量リストファイル
    label_scp: ラベルファイル
    feat_mean: 特徴量の平均値ベクトル
    feat_std:  特徴量の次元毎の標準偏差を並べたベクトル 
    pad_index: バッチ化の際にフレーム数を合わせる
               ためにpaddingする整数値
    splice:    前後(splice)フレームを特徴量を結合する
               splice=1とすると，前後1フレーム分結合
               するので次元数は3倍になる．
               splice=0の場合は何もしない
    '''
    def __init__(self, 
                 filename,
                 tokenizer
                 ):

        with open(filename, mode='r', encoding="utf-8") as file_f:
            for n, line in enumerate( file_f ):
                pass
        
        file_max = n
        
        # 特徴量リスト，ラベルを1行ずつ
        # 読み込みながら情報を取得する
        self.input_ids = []
        self.input_lens = []
        self.target_ids = []
        self.target_lens = []
        self.num_data = 0
        with open(filename, mode='r', encoding="utf-8") as file_f:
            for n, line in enumerate( file_f ):
                if n % 100000 == 0:
                    print( "n:", n )
                if n > file_max // 50:
                    break
                en1 = torch.tensor( tokenizer.encode(line), dtype=torch.int, requires_grad = False  )
                #print( "en1:", en1 )
                self.input_ids.append( en1 )
                self.input_lens.append( len( en1 ) )
                self.target_ids.append( en1 )
                self.target_lens.append( len( en1 ) )

                self.num_data += 1

    def __len__(self):
        ''' 学習データの総サンプル数を返す関数
        本実装では発話単位でバッチを作成するため，
        総サンプル数=発話数である．
        '''
        #return self.num_data // 100
        return self.num_data


    def __getitem__(self, idx):
        ''' サンプルデータを返す関数
        本実装では発話単位でバッチを作成するため，
        idx=発話番号である．
        '''
        # 特徴量系列のフレーム数
        input_len = self.input_lens[idx]
        # ラベルの長さ
        target_len = self.target_lens[idx]

        # ラベル
        input = self.input_ids[idx]

        # 発話ID
        target = self.target_ids[idx]

        # 特徴量，ラベル，フレーム数，
        # ラベル長，発話IDを返す
        return (input, target, input_len, target_len )

In [4]:
def collate_func(batch: Sequence[Tuple[Union[torch.Tensor, str]]], pad_index ):
    inputs0, targets0, input_lens, target_lens = zip(*batch)

    inputs = []
    targets = []
    for input1, target1, input_len, target_len in zip( inputs0, targets0, input_lens, target_lens ):
        pad_len = max( input_lens ) - input_len 
        input2= F.pad( input1, (0, pad_len), mode='constant', value = pad_index)
        inputs.append( input2 )
        pad_len = max( target_lens ) - target_len
        target2= F.pad( target1, (0, pad_len), mode='constant', value = pad_index)
        targets.append( target2 )

    
    inputs = torch.stack( inputs, dim = 0 )
    targets = torch.stack( targets, dim = 0 )
    input_lens = torch.tensor( input_lens )
    target_lens = torch.tensor( target_lens )
    
    return inputs, targets, input_lens, target_lens


In [5]:
collate_func_lambda = lambda x: collate_func(x, tokenizer.pad_token_id)

In [6]:
train_dataset = SequenceDataset( "train.txt", tokenizer )
print( "train dataset defiened, len( train_dataset):", len( train_dataset) )

# 訓練データのDataLoaderを呼び出す
# 訓練データはシャッフルして用いる
#  (num_workerは大きい程処理が速くなりますが，
#   PCに負担が出ます．PCのスペックに応じて
#   設定してください)


n: 0
train dataset defiened, len( train_dataset): 35439


In [7]:
val_dataset = SequenceDataset( "val.txt", tokenizer  )
print( "val dataset defiened, len( val_dataset):", len( val_dataset ) )

n: 0
val dataset defiened, len( val_dataset): 3938


In [8]:
batch_size = 16

num_workers = 0 if device == torch.device( 'cpu' ) else 8

train_loader = DataLoader(train_dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=num_workers,
                            pin_memory=True,
                            collate_fn = collate_func_lambda)
print( "train_loader defiend" )
# 開発データのDataLoaderを呼び出す
# 開発データはデータはシャッフルしない
val_loader = DataLoader(val_dataset,
                        batch_size=batch_size,
                        shuffle=False,
                        num_workers=num_workers,
                        collate_fn = collate_func_lambda)
print( "val_loader defined" )
print( len( train_loader ))
print( len( val_loader ))

text, target, text_len, target_len = next(iter(train_loader))
#print(text[0], target[0], text_len[0],target_len[0], sep="\n")
print( "text:", text )
#print( "text_len:", text_len )

train_loader defiend
val_loader defined
2215
247
text: tensor([[  101,  2172,  2038,  ...,     0,     0,     0],
        [  101,  1045,  2052,  ...,     0,     0,     0],
        [  101, 21658,  2343,  ...,     0,     0,     0],
        ...,
        [  101,  2009,  2003,  ...,     0,     0,     0],
        [  101,  2057,  5993,  ...,     0,     0,     0],
        [  101,  1996,  4895,  ...,     0,     0,     0]], dtype=torch.int32)


In [9]:
class PositionalEmbedding(nn.Module):
    '''
    位置埋め込み （Positional embedding）
    dim_embedding: 埋込み次元
    max_len      : 入力の最大系列長
    '''
    def __init__(self, dim_embedding: int, max_len: int=2048):
        super().__init__()

        self.pos_emb = nn.Embedding(max_len, dim_embedding)

    '''
    位置エンコーディングの順伝播
    x: 位置エンコーディングを埋め込む対象のテンソル,
       [バッチサイズ, 系列長, 埋め込み次元]
    '''
    def forward(self, x: torch.Tensor):
        seq = x.shape[1]
        positions = torch.arange(start=0, end=seq, step=1, device=x.device).to(torch.long)
        positions = self.pos_emb(positions)[:seq,:]
        
        return positions

In [10]:
class TransformerEncoder(nn.Module):
    '''
    CaptioningTransformerのコンストラクタ
    dim_embedding  : 埋め込み次元
    dim_feedforward: FNNの中間特徴次元
    num_heads      : マルチヘッドアテンションのヘッド数
    num_layers     : Transformerデコーダ層の数
    vocab_size     : 辞書の次元
    pad_index      : PADのID
    dropout        : ドロップアウト確率
    '''
    def __init__(self, vocab_size: int, dim_embedding: int, dim_feedforward: int,
                 num_heads: int, num_layers: int ):
        super().__init__()

        self.encoder_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=dim_embedding, nhead=num_heads, batch_first=True, activation='gelu', norm_first=True)
            for _ in range(num_layers)
        ])

    ''' CaptioningTransformerの順伝播処理
    features: 画像特徴量 [バッチサイズ, 埋め込み次元]
    captions: 正解キャプション [バッチサイズ, 系列長]

    '''
    def forward(self, src: torch.Tensor, src_mask: torch.Tensor=None, \
                src_key_padding_mask: torch.Tensor=None ):


        for layer in self.encoder_layers:
            src = layer( src, src_mask = src_mask, src_key_padding_mask = src_key_padding_mask,\
                        is_causal= True )
                        #is_causal= False )

        return src

In [11]:
class Transformer(nn.Module):
    def __init__(self, dim_embedding: int, dim_feedforward: int,
                 num_heads: int, num_layers: int, vocab_size: int,
                 pad_index: int, dropout: float=0.5, mask_prob: float=0.15 ):
        super().__init__()

        # 単語埋め込み
        self.embed = nn.Embedding(
            vocab_size, dim_embedding, padding_idx=pad_index)

        # 位置エンコーディング
        self.pos_emb = PositionalEmbedding(dim_embedding)        
        
        # dropout
        self.dropout = nn.Dropout( dropout )
        
        self.encoder = TransformerEncoder(vocab_size, dim_embedding, dim_feedforward, num_heads, num_layers)

        # 単語出力分布計算
        self.ln = nn.LayerNorm( dim_embedding )
        self.linear = nn.Linear(dim_embedding, vocab_size)

        self.pad_index = pad_index
        self.num_heads = num_heads
        self.mask_prob = mask_prob
        
    ''' CaptioningTransformerの順伝播処理
    features: 画像特徴量 [バッチサイズ, 埋め込み次元]
    captions: 正解キャプション [バッチサイズ, 系列長]

    '''
    def forward(self, text):

        device = text.device

        src = self.embed( text )
        
        src += self.pos_emb( src )
        src = self.dropout( src )
        src_key_padding_mask = torch.eq(text, self.pad_index)

        #ones = torch.ones( text.size(1) ).to( device = device )
        #src_mask = torch.diag( ones ).bool()
        #src_mask = None

        batch, num_seq = text.size()
        q_num_seq, k_num_seq = num_seq, num_seq
        num_heads = self.num_heads

        zero_one = [0,1]

        src_mask = torch.tensor( [], device = device, dtype=torch.bool)

        for i in range( batch ):
            pad_length = torch.sum( torch.eq( text[i], self.pad_index))
            k_zero_ones = torch.zeros( k_num_seq, device = device, dtype= torch.bool )
            k_zero_ones[1:-1-pad_length] = torch.rand( (text[i].shape[0] - 2 - pad_length), device = device ) < self.mask_prob
            q_k_zero_ones = k_zero_ones[None,:].expand( q_num_seq, k_num_seq )
            src_mask = torch.cat( [src_mask, q_k_zero_ones[None]], dim = 0 )

        src_mask = src_mask[:,None,:,:].expand( -1, num_heads, -1, -1 )
        src_mask = src_mask.reshape( batch*num_heads, num_seq, num_seq )         
        
        preds = self.encoder( src, src_mask, src_key_padding_mask )
        
        preds = self.ln( preds )
        logits = self.linear( preds )

        return logits

In [12]:
print( device )

cpu


In [12]:
epoch_num = 10
model = Transformer(768, 3072, 12, 6, vocab_size, tokenizer.pad_token_id ).to(device)

input_texts = torch.randint( 0, vocab_size, size=(16, 45))
outputs_logits = model( input_texts )

#criterion = nn.CrossEntropyLoss(ignore_index = tokenizer.pad_token_id )
criterion = nn.CrossEntropyLoss( )
#criterion = nn.CTCLoss(blank=idx_list_en['<blank>'], reduction='mean', zero_infinity = True )
#optimizer = optim.Adam(model.parameters(), lr=0.0001 )
#optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay = 0.005 )
#optimizer = optim.AdamW(model.parameters(), lr=0.0001, weight_decay = 0.001 )
#optimizer = optim.Adadelta(model.parameters(), lr = 1.0, weight_decay = 0.0 )
#optimizer = optim.Adadelta(model.parameters(),lr=1.0,rho=0.95,eps=1e-8,weight_decay=0.0)
#optimizer = optim.AdamW( model.parameters(), lr = 1e-4 )
lr = 1e-4
#lr = 1e-3
optimizer = optim.AdamW( model.parameters(), lr = lr )
# 全ステップ数
num_global_steps = len( train_loader ) * epoch_num
print( "num_global_steps:", num_global_steps )
num_warmup_steps = num_global_steps * 0.1
print( "num_warmup_steps:", num_warmup_steps )
#スケジューラーの定義
scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps, num_global_steps )
eps = 1e-4

num_global_steps: 22150
num_warmup_steps: 2215.0


In [13]:
# WarmupとCosine Decayを行うスケジューラを利用
#scheduler = CosineLRScheduler(
#    optimizer, t_initial=epoch_num, lr_min=1e-1,
#    warmup_t=5, warmup_lr_init=5e-2, warmup_prefix=True)

#tr_print_coef = 64000
#tr_save_coef = 1000
#val_print_coef = 240
len_tr_loader = len( train_loader )
len_val_loader = len( val_loader )
tr_print_coef = len_tr_loader // 10
tr_save_coef = len_tr_loader // 100
#tr_print_coef = len_val_loader // 10
#tr_save_coef = len_val_loader // 10
val_print_coef = len_val_loader // 3
print( "len( train_loader ):", len_tr_loader )
print( "len( val_loader ):", len_val_loader )
print( "tr_print_coef:", tr_print_coef )
print( "tr_save_coef:", tr_save_coef )
print( "val_print_coef:", val_print_coef )
#tr_print_coef = 1
#tr_save_coef = 1
#val_print_coef = 1
#train_length = len(train_loader)
#train_int = train_length // tr_print_coef
#print( train_int )
#print( train_length )
#val_length = len(val_loader)
#val_int = val_length // val_print_coef
#print( val_int )
#print( val_length )

history = {"len_tr_loader":[],"len_val_loader":[], "train_loss":[], "val_loss": [], "train_wer": [], "val_wer": [], "train_bleu": [], "val_bleu": [] }
history["len_tr_loader"].append( len_tr_loader )
history["len_val_loader"].append( len_val_loader )
with open("Diag_Mask3.pkl", "wb") as f:
    pickle.dump( history, f )      
n = 0
train_loss = 0
val_loss = 0


# 学習率の減衰やEarly stoppingの
# 判定を開始するエポック数
# (= 最低限このエポックまではどれだけ
# validation結果が悪くても学習を続ける)
lr_decay_start_epoch = 5

# 学習率を減衰する割合
# (減衰後学習率 <- 現在の学習率*lr_decay_factor)
# 1.0以上なら，減衰させない
lr_decay_factor = 0.5

# Early stoppingの閾値
# 最低損失値を更新しない場合が
# 何エポック続けば学習を打ち切るか
early_stop_threshold = 3

# 最も低い損失値，
# そのときのモデルとエポック数を記憶しておく
best_loss = -1
best_model = None
best_epoch = 0

# Early stoppingフラグ．Trueになると学習を打ち切る
early_stop_flag = False
# Early stopping判定用(損失値の最低値が
# 更新されないエポックが何回続いているか)のカウンタ
counter_for_early_stop = 0

fn = bleu_score.SmoothingFunction().method7

# AMP用のスケーラー
scaler = GradScaler(enabled=use_amp)

for epoch in range(epoch_num):
    # early stopフラグが立っている場合は，
    # 学習を打ち切る
    if early_stop_flag:
        print('    Early stopping.'\
            ' (early_stop_threshold = %d)' \
            % (early_stop_threshold))
        #log_file.write('\n    Early stopping.'\
        #        ' (early_stop_threshold = %d)' \
        #        % (early_stop_threshold))
        break
    
    with tqdm(train_loader) as pbar:
    #with tqdm(val_loader) as pbar:
        pbar.set_description(f'[Train エポック {epoch + 1}]')
    
        model.train()
        #scheduler.step(epoch)
        #print( "Train")
        train_loss = 0
        mean_error = 0
        mean_bleu = 0
        n3 = 0
        for i, ( text, target, text_len, target_len ) in enumerate( pbar ):
            optimizer.zero_grad()
            text = text.to(device)
            target = target.to(device).long()
            
            with autocast(str(device),enabled=use_amp):
                outputs = model( text )
                # 損失の計算
                loss = criterion(outputs.transpose(1,2) + eps, target)

            preds = torch.argmax( outputs, dim = 2 )
            #print( "preds:",preds[0])
            #print( "text:",text[0])
            #print( "target:", target[0] )
                
            # 誤差逆伝播
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            clip_grad_threshold = 5.0
            torch.nn.utils.clip_grad_norm_(\
                    model.parameters(),
                    clip_grad_threshold)
            # オプティマイザにより，パラメータを更新する
            scaler.step(optimizer)
            scaler.update()            
            
            scheduler.step()
            
            #lr = optimizer.param_groups[0]['lr']
            #print( "lr:", lr )

            total_error = 0
            total_token_length = 0
            total_bleu = 0
            n2 = 0       
            hypo_sentence = []
            ref_sentence = []
            for hypo_id, caption in zip( preds, target ):
                hypo = tokenizer.decode( hypo_id, skip_special_tokens = True )
                #print( "hypo_id:", hypo_id )
                hypo_list = tokenizer.tokenize( hypo )
                reference = tokenizer.decode( caption, skip_special_tokens = True )
                ref_list =  tokenizer.tokenize( reference )
                #print( "hypo:", hypo )
                #print( "hypo_list:", hypo_list )
                #print( "reference:", reference )
                #print( "ref_list:", ref_list )
                
                # 認識誤りを計算
                #print( "hypo_list:", hypo_list)
                #print( "ref_list:", ref_list )
                (error, substitute, 
                    delete, insert, ref_length) = \
                    levenshtein.calculate_error(hypo_list,ref_list)

                #print( "error:", error, "ref_length:", ref_length )
                bleu = bleu_score.sentence_bleu( [reference], hypo, smoothing_function=fn )
        
                total_bleu += bleu
    
                total_error += error
                total_token_length += ref_length

                #if n2 < 2 and ( i % tr_print_coef == tr_print_coef -1  or  i == len( train_loader ) -1 ) :
                if n2 < 2 and ( i % tr_print_coef == tr_print_coef -1 ) :
                    hypo_sentence.append( hypo )
                    ref_sentence.append( reference )
                    
                n2 += 1
                
            train_loss += loss.item()
            history["train_loss"].append( loss.item() )
            history["train_wer"].append( total_error/total_token_length * 100)
            mean_error += total_error/total_token_length * 100
            history["train_bleu"].append( total_bleu/ n2 * 100 )
            mean_bleu += total_bleu/ n2 * 100
            n3 += 1
            if i % tr_save_coef == tr_save_coef - 1:
                with open("Diag_Mask3.pkl", "wb") as f:
                    pickle.dump( history, f )            
            if i % tr_print_coef == tr_print_coef - 1:
                lr = optimizer.param_groups[0]['lr']
                #print(f"Train epoch:{epoch+1}  index:{i+1} loss:{train_loss/n3:.10f} WER:{mean_error / n3:.10f} BLEU:{mean_bleu / n3:.10f } lr:{lr}")
                print(f"Train epoch:{epoch+1}  index:{i+1}  loss:{train_loss/n3:.10f}   WER:{mean_error / n3:.10f} BLEU:{mean_bleu / n3 } lr:{lr:.10f}")
            #if i == len( train_loader ) - 1:
            for (hypo_s, refe_s ) in zip( hypo_sentence, ref_sentence ):
                print( "index:", i+1, "target:", refe_s)
                print( "index:", i+1, "hypo  :", hypo_s )
            pbar.set_postfix({
                    'loss': train_loss / n3,
                    'WER': mean_error / n3,
                    'BLEU': mean_bleu / n3
            })
    
    with tqdm(val_loader) as pbar:
        pbar.set_description(f'[検証]')
        model.eval()
        #for i, ( text, target, text_len, target_len ) in enumerate(val_loader):
        val_loss = 0
        mean_error = 0
        mean_bleu = 0
        n3 = 0
        for i, ( text, target, text_len, target_len ) in enumerate( pbar ):
            text = text.to(device)
            target = target.to(device).long()

            with torch.no_grad():
                outputs = model(text)
                preds = torch.argmax( outputs, dim = 2 )
                loss = criterion( outputs.transpose(1, 2) + eps, target )
           
            total_error = 0
            total_token_length = 0
            total_bleu = 0
            n2 = 0
            hypo_sentence = []
            ref_sentence = []
            for hypo_id, caption in zip( preds, target ):
                #hypo = tokenizer.decode( hypo_id )
                hypo = tokenizer.decode( hypo_id, skip_special_tokens = True )
                #print( "hypo_id:", hypo_id )
                hypo_list = tokenizer.tokenize( hypo )
                #reference = tokenizer.decode( caption )
                reference = tokenizer.decode( caption, skip_special_tokens = True )
                ref_list =  tokenizer.tokenize( reference )
                
                # 認識誤りを計算
                (error, substitute, 
                    delete, insert, ref_length) = \
                    levenshtein.calculate_error(hypo_list,ref_list)

                bleu = bleu_score.sentence_bleu( [reference], hypo, smoothing_function=fn )
                #print( "bleu:", bleu )
                total_bleu += bleu
            
                total_error += error
                total_token_length += ref_length

                if n2 < 2 and ( i % val_print_coef == val_print_coef -1 ) :
                    hypo_sentence.append( hypo )
                    ref_sentence.append( reference )
                    
                n2 += 1
        
            val_loss += loss.item()
            history["val_loss"].append( loss.item() )
            history["val_wer"].append( total_error/total_token_length * 100)
            mean_error += total_error/total_token_length * 100
            history["val_bleu"].append( total_bleu / n2 * 100 )
            #print( "total_bleu / n2 * 100:", total_bleu / n2 * 100 )
            mean_bleu += total_bleu / n2 * 100
            n3 += 1
            #print(" mean_bleu:", mean_bleu)
            #print( "n3:", n3 )
            #print( "mean_bleu/ n3:", mean_bleu / n3)
            if i % val_print_coef == val_print_coef - 1:
                lr = optimizer.param_groups[0]['lr']
                print(f"Val epoch:{epoch+1}  index:{i+1}  loss:{val_loss/n3:.10f}   WER:{mean_error / n3:.10f} BLEU:{mean_bleu / n3 } lr:{lr:.10f}")
                PATH = './Diag_Mask3_curr.pt'
                torch.save({'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': loss,},
                     PATH)
                with open("Diag_Mask3.pkl", "wb") as f:
                    pickle.dump( history, f )
            for (hypo_s, refe_s ) in zip( hypo_sentence, ref_sentence ):
                print( "index:", i+1, "target:", refe_s)
                print( "index:", i+1, "hypo  :", hypo_s )
            pbar.set_postfix({
                    'loss': val_loss / n3,
                    'WER': mean_error/ n3,
                    'BLEU': mean_bleu / n3
                })

    epoch_loss = val_loss/n3
    if epoch == 0 or best_loss > epoch_loss:
        # 損失値が最低値を更新した場合は，
        # その時のモデルを保存する
        best_loss = epoch_loss
        torch.save(model.state_dict(), 
                    './best_model_Mask3.pt')
        best_epoch = epoch
        # Early stopping判定用の
        # カウンタをリセットする
        counter_for_early_stop = 0
    else:
        # 最低値を更新しておらず，
        if epoch+1 >= lr_decay_start_epoch:
            # かつlr_decay_start_epoch以上の
            # エポックに達している場合
            if counter_for_early_stop+1 \
                    >= early_stop_threshold:
                # 更新していないエポックが，
                # 閾値回数以上続いている場合，
                # Early stopping フラグを立てる
                early_stop_flag = True
            else:
                # Early stopping条件に
                # 達していない場合は
                # 学習率を減衰させて学習続行
                if lr_decay_factor < 1.0:
                    for i, param_group \
                            in enumerate(\
                            optimizer.param_groups):
                        if i == 0:
                            lr = param_group['lr']
                            dlr = lr_decay_factor \
                                * lr
                            print('    (Decay '\
                                'learning rate:'\
                                ' %f -> %f)' \
                                % (lr, dlr))
                        param_group['lr'] = dlr
                # Early stopping判定用の
                # カウンタを増やす
                counter_for_early_stop += 1
#torch.cuda.synchronize()    

len( train_loader ): 2215
len( val_loader ): 247
tr_print_coef: 221
tr_save_coef: 22
val_print_coef: 82


  0%|          | 0/2215 [00:00<?, ?it/s]

Train epoch:1  index:221  loss:7.2556655126   WER:159.4743630469 BLEU:8.099403539081008 lr:0.0000099774
index: 221 target: whilst many additives are harmless, this does not mean that they all are.
index: 221 hypo  : 
index: 221 target: with regard to the specific point you are making, i recognise that you are perfectly right and that this is an issue which should be included in the agenda for work in this area as soon as possible.
index: 221 hypo  : 
Train epoch:1  index:442  loss:5.3764363481   WER:124.8186858435 BLEU:4.098882796413096 lr:0.0000199549
index: 442 target: as the commissioner said, and i agree with him, it is a matter of political decision.
index: 442 hypo  : the, and, a of.
index: 442 target: in my view, and i say this quite clearly, the question is ideologically motivated rather than solution - focused.
index: 442 hypo  : in, and, the is.
Train epoch:1  index:663  loss:4.3469662138   WER:104.7144767457 BLEU:5.320421706706515 lr:0.0000299323
index: 663 target: after all

  0%|          | 0/247 [00:00<?, ?it/s]

Val epoch:1  index:82  loss:0.6949095708   WER:18.1116958831 BLEU:77.21325186225981 lr:0.0001000000
index: 82 target: on the one hand they want enlargement while, on the other hand, they are not prepared to pay for it.
index: 82 hypo  : on the one hand they want enlargement while, on the other hand, they are not prepared to for it.
index: 82 target: they cannot have it both ways.
index: 82 hypo  : they cannot have it both.
Val epoch:1  index:164  loss:0.6789506986   WER:17.8771287613 BLEU:76.97591404715521 lr:0.0001000000
index: 164 target: according to the information we have received from the luxembourg authorities, the centre has been created within the context of a cooperation agreement between the council of europe and the luxembourg authorities.
index: 164 hypo  : according to the information we have received from the authorities, the has been created within the context of a cooperation agreement between the council of europe and the authorities.
index: 164 target: of course, the

  0%|          | 0/2215 [00:00<?, ?it/s]

Train epoch:2  index:221  loss:0.4876597394   WER:10.9974657518 BLEU:94.77126225667232 lr:0.0000988914
index: 221 target: the ever increasing volume of goods passing through europe entails all kinds of risks, known and unknown, for employees and the social environment.
index: 221 hypo  : the ever increasing volume of goods involved through europe en where all therefore of risks, known and and, for employees and the social environment.
index: 221 target: the insertion of article 13, which was referred to during the debate, dealing with non - discrimination in the treaty establishing the european community, is one of the most significant changes of the last revision of the treaties.
index: 221 hypo  : the put of article 13, which was referred to during the debate, dealing with non - discrimination in the treaty establishing the european community, is one of the most significant changes of the last revision of the treaties.
Train epoch:2  index:442  loss:0.4472051348   WER:9.9313637350 BL

  0%|          | 0/247 [00:00<?, ?it/s]

Val epoch:2  index:82  loss:0.1649753818   WER:3.7472800820 BLEU:105.40210214594266 lr:0.0000888889
index: 82 target: on the one hand they want enlargement while, on the other hand, they are not prepared to pay for it.
index: 82 hypo  : on the one hand they want enlargement while, on the other hand, they are not prepared to pay for it.
index: 82 target: they cannot have it both ways.
index: 82 hypo  : they cannot have it both ways.
Val epoch:2  index:164  loss:0.1579415678   WER:3.6095862691 BLEU:105.50475157756647 lr:0.0000888889
index: 164 target: according to the information we have received from the luxembourg authorities, the centre has been created within the context of a cooperation agreement between the council of europe and the luxembourg authorities.
index: 164 hypo  : according to the information we have received from the luxembourg authorities, the centre has been created within the context of a cooperation agreement between the council of europe and the luxembourg authorit

  0%|          | 0/2215 [00:00<?, ?it/s]

Train epoch:3  index:221  loss:0.1446910265   WER:2.7952834259 BLEU:106.59505299515612 lr:0.0000877803
index: 221 target: as mr davies also mentioned, standards are, in actual fact, lower than those in the united states.
index: 221 hypo  : as mr davies also mentioned, standards are, in actual fact, lower than those in the united states.
index: 221 target: perhaps before we criticise anybody outside this place we should take a good look at ourselves and what we do here.
index: 221 hypo  : perhaps before we criticise anybody outside this place we should take a good look at ourselves and what we do here.
Train epoch:3  index:442  loss:0.1372159127   WER:2.6220848286 BLEU:106.95742657152289 lr:0.0000866717
index: 442 target: in sweden and finland, we have taken extensive measures in order to reduce national levels of air pollution.
index: 442 hypo  : in sweden and finland, we have taken extensive measures in order to reduce national levels of air pollution.
index: 442 target: they are poli

  0%|          | 0/247 [00:00<?, ?it/s]

Val epoch:3  index:82  loss:0.1048497738   WER:2.1116027579 BLEU:107.98959751762855 lr:0.0000777778
index: 82 target: on the one hand they want enlargement while, on the other hand, they are not prepared to pay for it.
index: 82 hypo  : on the one hand they want enlargement while, on the other hand, they are not prepared to pay for it.
index: 82 target: they cannot have it both ways.
index: 82 hypo  : they cannot have it both ways.
Val epoch:3  index:164  loss:0.0982564215   WER:2.0426081823 BLEU:108.07443453115833 lr:0.0000777778
index: 164 target: according to the information we have received from the luxembourg authorities, the centre has been created within the context of a cooperation agreement between the council of europe and the luxembourg authorities.
index: 164 hypo  : according to the information we have received from the luxembourg authorities, the centre has been created within the context of a cooperation agreement between the council of europe and the luxembourg authorit

  0%|          | 0/2215 [00:00<?, ?it/s]

Train epoch:4  index:221  loss:0.0610091193   WER:1.0713955180 BLEU:109.23269324935929 lr:0.0000766692
index: 221 target: in no case, however, are they programmes for the integrated development of the selected areas and they do not create permanent jobs in the countryside because most of the activities there are not productive in nature.
index: 221 hypo  : in no case, however, are they programmes for the integrated development of the selected areas and they do not create permanent jobs in the countryside because most of the activities there are not productive in nature.
index: 221 target: how negotiations are proceeding there, we still do not know.
index: 221 hypo  : how negotiations are proceeding there, we still do not know.
Train epoch:4  index:442  loss:0.0597789884   WER:1.0443178495 BLEU:109.27682997184759 lr:0.0000755606
index: 442 target: if i can explain procedurally and very briefly, the social affairs committee voted on monday evening at the same time as the committee on ind

  0%|          | 0/247 [00:00<?, ?it/s]

Val epoch:4  index:82  loss:0.0821428566   WER:1.6152857203 BLEU:108.67903385418896 lr:0.0000666667
index: 82 target: on the one hand they want enlargement while, on the other hand, they are not prepared to pay for it.
index: 82 hypo  : on the one hand they want enlargement while, on the other hand, they are not prepared to pay for it.
index: 82 target: they cannot have it both ways.
index: 82 hypo  : they cannot have it both ways.
Val epoch:4  index:164  loss:0.0758322611   WER:1.5080483569 BLEU:108.84104942945403 lr:0.0000666667
index: 164 target: according to the information we have received from the luxembourg authorities, the centre has been created within the context of a cooperation agreement between the council of europe and the luxembourg authorities.
index: 164 hypo  : according to the information we have received from the luxembourg authorities, the centre has been created within the context of a cooperation agreement between the council of europe and the luxembourg authorit

  0%|          | 0/2215 [00:00<?, ?it/s]

Train epoch:5  index:221  loss:0.0332222925   WER:0.5738403282 BLEU:110.04102500606957 lr:0.0000655581
index: 221 target: we are therefore voting against the report and strongly urge the abandonment of the idea of harmonising criminal law on such flimsy grounds. instead, the focus should be upon preventing the real problem.
index: 221 hypo  : we are therefore voting against the report and strongly urge the abandonment of the idea of harmonising criminal law on such flim ii grounds. instead, the focus should be upon preventing the real problem.
index: 221 target: given this sad situation, all those in serbia who stand up for the freedom of the press deserve our support.
index: 221 hypo  : given this sad situation, all those in serbia who stand up for the freedom of the press deserve our support.
Train epoch:5  index:442  loss:0.0335845239   WER:0.5737585053 BLEU:110.06602910752089 lr:0.0000644495
index: 442 target: last december, this parliament voted to maintain the financial perspecti

  0%|          | 0/247 [00:00<?, ?it/s]

Val epoch:5  index:82  loss:0.0759700729   WER:1.5016241950 BLEU:109.01282270481083 lr:0.0000555556
index: 82 target: on the one hand they want enlargement while, on the other hand, they are not prepared to pay for it.
index: 82 hypo  : on the one hand they want enlargement while, on the other hand, they are not prepared to pay for it.
index: 82 target: they cannot have it both ways.
index: 82 hypo  : they cannot have it both ways.
Val epoch:5  index:164  loss:0.0697626139   WER:1.4217141320 BLEU:109.15125576598015 lr:0.0000555556
index: 164 target: according to the information we have received from the luxembourg authorities, the centre has been created within the context of a cooperation agreement between the council of europe and the luxembourg authorities.
index: 164 hypo  : according to the information we have received from the luxembourg authorities, the centre has been created within the context of a cooperation agreement between the council of europe and the luxembourg authorit

  0%|          | 0/2215 [00:00<?, ?it/s]

Train epoch:6  index:221  loss:0.0208254478   WER:0.3175737915 BLEU:110.70611092281203 lr:0.0000544470
index: 221 target: it is essential that the interests of all parties are taken into account.
index: 221 hypo  : it is essential that the interests of all parties are taken into account.
index: 221 target: this is not the union's only contribution.
index: 221 hypo  : this is not the union's only contribution.
Train epoch:6  index:442  loss:0.0201675649   WER:0.3038861564 BLEU:110.5919911524301 lr:0.0000533383
index: 442 target: this all revolves around the kind of philosophy on man and society people take their inspiration from, and mr belder has presented his in no uncertain terms.
index: 442 hypo  : this all revolves around the kind of philosophy on man and society people take their inspiration from, and mr belder has presented his in no uncertain terms.
index: 442 target: mr president, i voted for the maccormick report on liability for defective products because'it is meet and right

  0%|          | 0/247 [00:00<?, ?it/s]

Val epoch:6  index:82  loss:0.0600503879   WER:0.8763536679 BLEU:109.84539398864601 lr:0.0000444444
index: 82 target: on the one hand they want enlargement while, on the other hand, they are not prepared to pay for it.
index: 82 hypo  : on the one hand they want enlargement while, on the other hand, they are not prepared to pay for it.
index: 82 target: they cannot have it both ways.
index: 82 hypo  : they cannot have it both ways.
Val epoch:6  index:164  loss:0.0548871921   WER:0.8312849356 BLEU:109.97516911492698 lr:0.0000444444
index: 164 target: according to the information we have received from the luxembourg authorities, the centre has been created within the context of a cooperation agreement between the council of europe and the luxembourg authorities.
index: 164 hypo  : according to the information we have received from the luxembourg authorities, the centre has been created within the context of a cooperation agreement between the council of europe and the luxembourg authorit

  0%|          | 0/2215 [00:00<?, ?it/s]

Train epoch:7  index:221  loss:0.0130252396   WER:0.1633709076 BLEU:110.59363838293062 lr:0.0000433358
index: 221 target: in addition, eur 1. 4 m from the food security budget line have been made available locally to the delegation and the government of mozambique.
index: 221 hypo  : in addition, eur 1. 4 m from the food security budget line have been made available locally to the delegation and the government of mozambique.
index: 221 target: . mr president, we had an extremely useful debate on human rights yesterday.
index: 221 hypo  : . mr president, we had an extremely useful debate on human rights yesterday.
Train epoch:7  index:442  loss:0.0130346360   WER:0.1636774859 BLEU:110.65355821140268 lr:0.0000422272
index: 442 target: human rights must be integrated into the eu's activities naturally : in aid, in trade, in economic, cultural and political cooperation.
index: 442 hypo  : human rights must be integrated into the eu's activities naturally : in aid, in trade, in economic, cu

  0%|          | 0/247 [00:00<?, ?it/s]

Val epoch:7  index:82  loss:0.0753399308   WER:1.3345016097 BLEU:109.13545313626764 lr:0.0000333333
index: 82 target: on the one hand they want enlargement while, on the other hand, they are not prepared to pay for it.
index: 82 hypo  : on the one hand they want enlargement while, on the other hand, they are not prepared to pay for it.
index: 82 target: they cannot have it both ways.
index: 82 hypo  : they cannot have it both ways.
Val epoch:7  index:164  loss:0.0701540054   WER:1.2905761430 BLEU:109.24838273775264 lr:0.0000333333
index: 164 target: according to the information we have received from the luxembourg authorities, the centre has been created within the context of a cooperation agreement between the council of europe and the luxembourg authorities.
index: 164 hypo  : according to the information we have received from the luxembourg authorities, the centre has been created within the context of a cooperation agreement between the council of europe and the luxembourg authorit

  0%|          | 0/2215 [00:00<?, ?it/s]

Train epoch:8  index:221  loss:0.0099952199   WER:0.0751489690 BLEU:110.96194201650549 lr:0.0000322247
index: 221 target: clearly, we must consider the problem which exists here and now and the challenge presented by the existing range of cars.
index: 221 hypo  : clearly, we must consider the problem which exists here and now and the challenge presented by the existing range of cars.
index: 221 target: competition yes, restrictions in state aid where necessary and where possible.
index: 221 hypo  : competition yes, restrictions in state aid where necessary and where possible.
Train epoch:8  index:442  loss:0.0097954957   WER:0.0775136750 BLEU:111.01548263746889 lr:0.0000311161
index: 442 target: if that is not the case, there will have to be some form of sanction to ensure these matters are put right.
index: 442 hypo  : if that is not the case, there will have to be some form of sanction to ensure these matters are put right.
index: 442 target: europe is therefore facing an arduous tas

  0%|          | 0/247 [00:00<?, ?it/s]

Val epoch:8  index:82  loss:0.0616410464   WER:1.0058207953 BLEU:109.72707645697248 lr:0.0000222222
index: 82 target: on the one hand they want enlargement while, on the other hand, they are not prepared to pay for it.
index: 82 hypo  : on the one hand they want enlargement while, on the other hand, they are not prepared to pay for it.
index: 82 target: they cannot have it both ways.
index: 82 hypo  : they cannot have it both ways.
Val epoch:8  index:164  loss:0.0558404587   WER:0.9047310031 BLEU:109.92670190734371 lr:0.0000222222
index: 164 target: according to the information we have received from the luxembourg authorities, the centre has been created within the context of a cooperation agreement between the council of europe and the luxembourg authorities.
index: 164 hypo  : according to the information we have received from the luxembourg authorities, the centre has been created within the context of a cooperation agreement between the council of europe and the luxembourg authorit

  0%|          | 0/2215 [00:00<?, ?it/s]

Train epoch:9  index:221  loss:0.0069285224   WER:0.0209861892 BLEU:110.92274660615797 lr:0.0000211136
index: 221 target: the contempt for human beings, and life generally, that now exists can only feed the worst excesses of irresponsible scientists.
index: 221 hypo  : the contempt for human beings, and life generally, that now exists can only feed the worst excesses of irresponsible scientists.
index: 221 target: stenzel report ( a5 - 0034 / 2000 )
index: 221 hypo  : stenzel report ( a5 - 0034 / 2000 )
Train epoch:9  index:442  loss:0.0067661282   WER:0.0248336634 BLEU:111.06008883301028 lr:0.0000200050
index: 442 target: this issue comes under the general transparency rules of the european institutions which were, in fact, defined in the conclusions of the last european council, in helsinki, and i quote, as " an important element in bringing the union closer to its citizens and improving efficiency ".
index: 442 hypo  : this issue comes under the general transparency rules of the eur

  0%|          | 0/247 [00:00<?, ?it/s]

Val epoch:9  index:82  loss:0.0613966506   WER:1.0303767908 BLEU:109.62668177332573 lr:0.0000111111
index: 82 target: on the one hand they want enlargement while, on the other hand, they are not prepared to pay for it.
index: 82 hypo  : on the one hand they want enlargement while, on the other hand, they are not prepared to pay for it.
index: 82 target: they cannot have it both ways.
index: 82 hypo  : they cannot have it both ways.
Val epoch:9  index:164  loss:0.0561122627   WER:0.9368929721 BLEU:109.8287727522919 lr:0.0000111111
index: 164 target: according to the information we have received from the luxembourg authorities, the centre has been created within the context of a cooperation agreement between the council of europe and the luxembourg authorities.
index: 164 hypo  : according to the information we have received from the luxembourg authorities, the centre has been created within the context of a cooperation agreement between the council of europe and the luxembourg authoriti

NameError: name 'log_file' is not defined

In [None]:
print( torch.__version__ )