In [1]:
!pip install torchsummary

Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1
[0m

In [2]:
# Import some useful packages for this homework
import numpy as np
import pandas as pd
import torch
import os
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import ConcatDataset, DataLoader, Subset, Dataset # "ConcatDataset" and "Subset" are possibly useful
from torchvision.datasets import DatasetFolder, VisionDataset
from torchsummary import summary
from tqdm.auto import tqdm
import random

# !nvidia-smi # list your current GPU

In [3]:
cfg = {
    'dataset_root': '/kaggle/input/ml2023spring-hw13/Food-11',
    'save_dir': '/kaggle/working/outputs',
    'exp_name': "kaggle_test",
    'batch_size': 64,
    'lr': 3e-4,
    'seed': 20220013,
    'loss_fn_type': 'KD', # simple baseline: CE, medium baseline: KD. See the Knowledge_Distillation part for more information.
    'weight_decay': 1e-5,
    'grad_norm_max': 10,
    'n_epochs': 600, # train more steps to pass the medium baseline.
    'patience': 50,
}

In [4]:
myseed = cfg['seed']  # set a random seed for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(myseed)
torch.manual_seed(myseed)
random.seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)

save_path = os.path.join(cfg['save_dir'], cfg['exp_name']) # create saving directory
os.makedirs(save_path, exist_ok=True)

# define simple logging functionality
log_fw = open(f"{save_path}/log.txt", 'w') # open log file to save log outputs
def log(text):     # define a logging function to trace the training process
    print(text)
    log_fw.write(str(text)+'\n')
    log_fw.flush()

log(cfg)  # log your configs to the log file

{'dataset_root': '/kaggle/input/ml2023spring-hw13/Food-11', 'save_dir': '/kaggle/working/outputs', 'exp_name': 'kaggle_test', 'batch_size': 64, 'lr': 0.0003, 'seed': 20220013, 'loss_fn_type': 'KD', 'weight_decay': 1e-05, 'grad_norm_max': 10, 'n_epochs': 600, 'patience': 50}


In [5]:
for dirname, _, filenames in os.walk('/kaggle/input/ml2023spring-hw13/Food-11'):
    if len(filenames) > 0:
        print(f"{dirname}: {len(filenames)} files.") # Show the file amounts in each split.

/kaggle/input/ml2023spring-hw13/Food-11: 1 files.
/kaggle/input/ml2023spring-hw13/Food-11/validation: 4432 files.
/kaggle/input/ml2023spring-hw13/Food-11/training: 9993 files.
/kaggle/input/ml2023spring-hw13/Food-11/evaluation: 2218 files.


In [6]:
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# define training/testing transforms
test_tfm = transforms.Compose([
    # It is not encouraged to modify this part if you are using the provided teacher model. This transform is stardard and good enough for testing.
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])

train_tfm = transforms.Compose([
    # add some useful transform or augmentation here, according to your experience in HW3.
    transforms.Resize(256),  # You can change this
    transforms.CenterCrop(224), # You can change this, but be aware of that the given teacher model's input size is 224.
    # The training input size of the provided teacher model is (3, 224, 224).
    # Thus, Input size other then 224 might hurt the performance. please be careful.
    transforms.RandomGrayscale(p=0.2),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomAffine(30), # You can change this.
    transforms.ToTensor(),
    normalize,
])

In [7]:
class FoodDataset(Dataset):
    def __init__(self, path, tfm=test_tfm, files = None):
        super().__init__()
        self.path = path
        self.files = sorted([os.path.join(path,x) for x in os.listdir(path) if x.endswith(".jpg")])
        if files != None:
            self.files = files
        print(f"One {path} sample",self.files[0])
        self.transform = tfm
  
    def __len__(self):
        return len(self.files)
  
    def __getitem__(self,idx):
        fname = self.files[idx]
        im = Image.open(fname)
        im = self.transform(im)
        try:
            label = int(fname.split("/")[-1].split("_")[0])
        except:
            label = -1 # test has no label
        return im,label

In [8]:
# Form train/valid dataloaders
train_set = FoodDataset(os.path.join(cfg['dataset_root'],"training"), tfm=train_tfm)
train_loader = DataLoader(train_set, batch_size=cfg['batch_size'], shuffle=True, num_workers=0, pin_memory=True)

valid_set = FoodDataset(os.path.join(cfg['dataset_root'], "validation"), tfm=test_tfm)
valid_loader = DataLoader(valid_set, batch_size=cfg['batch_size'], shuffle=False, num_workers=0, pin_memory=True)

One /kaggle/input/ml2023spring-hw13/Food-11/training sample /kaggle/input/ml2023spring-hw13/Food-11/training/0_0.jpg
One /kaggle/input/ml2023spring-hw13/Food-11/validation sample /kaggle/input/ml2023spring-hw13/Food-11/validation/0_0.jpg


In [9]:
# Define your student network here. You have to copy-paste this code block to HW13 GradeScope before deadline.
# We will use your student network definition to evaluate your results(including the total parameter amount).

# Example implementation of Depthwise and Pointwise Convolution 
def dwpw_conv(in_channels, out_channels, kernel_size, stride=1, padding=1,bias=False):
    return nn.Sequential(
        nn.Conv2d(in_channels, in_channels, kernel_size, stride=stride, padding=padding,bias=bias, groups=in_channels), #depthwise convolution
        nn.BatchNorm2d(in_channels),
        nn.ReLU(inplace=True),
        nn.Conv2d(in_channels, out_channels, 1,  bias= bias,), # pointwise convolution
        nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True)
    )

class StudentNet(nn.Module):
    def __init__(self, inplanes = 64):
        super().__init__()
        self.inplanes = inplanes
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = dwpw_conv(inplanes,inplanes, kernel_size=3)
        self.layer2 = dwpw_conv(inplanes,128, kernel_size=3, stride=2)
        self.layer3 = dwpw_conv(128, 198, kernel_size=3, stride=2)
        self.layer4 = dwpw_conv(198, 33, kernel_size=3, stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(33, 11)

    def forward(self, x):
        x=self.conv1(x)
        x=self.bn1(x)
        x=self.relu(x)
        x=self.maxpool(x)

        x=self.layer1(x)
        x=self.layer2(x)
        x=self.layer3(x)
        x=self.layer4(x)

        x=self.avgpool(x)
        x = torch.flatten(x, 1)
        x=self.fc(x)

        return x
    
def get_student_model(): # This function should have no arguments so that we can get your student network by directly calling it.
    # you can modify or do anything here, just remember to return an nn.Module as your student network.  
    return StudentNet() 

# End of definition of your student model and the get_student_model API
# Please copy-paste the whole code block, including the get_student_model function.

In [10]:
# DO NOT modify this block and please make sure that this block can run sucessfully. 
student_model = get_student_model()
summary(student_model, (3, 224, 224), device='cpu')
# You have to copy&paste the results of this block to HW13 GradeScope. 

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]             576
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]           4,096
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
           Conv2d-11           [-1, 64, 28, 28]             576
      BatchNorm2d-12           [-1, 64, 28, 28]             128
             ReLU-13           [-1, 64, 28, 28]               0
           Conv2d-14          [-1, 128,

In [11]:
# Load provided teacher model (model architecture: resnet18, num_classes=11, test-acc ~= 89.9%)
teacher_model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=False, num_classes=11)
# load state dict
teacher_ckpt_path = os.path.join(cfg['dataset_root'], "resnet18_teacher.ckpt")
teacher_model.load_state_dict(torch.load(teacher_ckpt_path, map_location='cpu'))
# Now you already know the teacher model's architecture. You can take advantage of it if you want to pass the strong or boss baseline. 
# Source code of resnet in pytorch: (https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py)
# You can also see the summary of teacher model. There are 11,182,155 parameters totally in the teacher model
summary(teacher_model, (3, 224, 224), device='cpu')

Downloading: "https://github.com/pytorch/vision/zipball/v0.10.0" to /root/.cache/torch/hub/v0.10.0.zip


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]          36,864
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
       BasicBlock-11           [-1, 64, 56, 56]               0
           Conv2d-12           [-1, 64, 56, 56]          36,864
      BatchNorm2d-13           [-1, 64, 56, 56]             128
             ReLU-14           [-1, 64,

In [12]:
 Slayer1out, Slayer2out, Tlayer1out, Tlayer2out = [], [], [], []

def hookS1(module, input, output):
    Slayer1out.append(output)
    return None

def hookS2(module, input, output):
    Slayer2out.append(output)
    return None

'''
def hookS3(module, input, output):
    Slayer3out.append(output)
    return None
'''

def hookT1(module, input, output):
    Tlayer1out.append(output)
    return None

def hookT2(module, input, output):
    Tlayer2out.append(output)
    return None

'''
def hookT3(module, input, output):
    Tlayer3out.append(output)
    return None
'''

student_model = StudentNet()
student_model.layer1.register_forward_hook(hookS1)
student_model.layer2.register_forward_hook(hookS2)
#student_model.layer3.register_forward_hook(hookS3)

teacher_model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=False, num_classes=11)
teacher_ckpt_path = os.path.join(cfg['dataset_root'], "resnet18_teacher.ckpt")
teacher_model.load_state_dict(torch.load(teacher_ckpt_path, map_location='cpu'))

teacher_model.layer1.register_forward_hook(hookT1)
teacher_model.layer2.register_forward_hook(hookT2)
#teacher_model.layer3.register_forward_hook(hookT3)

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


<torch.utils.hooks.RemovableHandle at 0x79c66bd72b90>

In [13]:
# Implement the loss function with KL divergence loss for knowledge distillation.
# You also have to copy-paste this whole block to HW13 GradeScope. 
def loss_fn_kd(student_logits, labels, teacher_logits, alpha=0.99, temperature=25.0):
    
    kl_loss = nn.KLDivLoss(reduction="batchmean", log_target=True)
    CE_loss = nn.CrossEntropyLoss()
    p = F.log_softmax(student_logits / temperature, dim=1)
    q = F.log_softmax(teacher_logits / temperature, dim=1)
    loss=alpha * (temperature**2) * kl_loss(p, q) + (1-alpha) * CE_loss(student_logits,labels)

    return loss
    # ------------TODO-------------
    # Refer to the above formula and finish the loss function for knowkedge distillation using KL divergence loss and CE loss.
    # If you have no idea, please take a look at the provided useful link above.

In [14]:
# choose the loss function by the config
if cfg['loss_fn_type'] == 'CE':
    # For the classification task, we use cross-entropy as the default loss function.
    loss_fn = nn.CrossEntropyLoss() # loss function for simple baseline.

if cfg['loss_fn_type'] == 'KD': # KD stands for knowledge distillation
    loss_fn = loss_fn_kd # implement loss_fn_kd for the report question and the medium baseline.

# You can also adopt other types of knowledge distillation techniques for strong and boss baseline, but use function name other than `loss_fn_kd`
# For example:
# def loss_fn_custom_kd():
#     pass
# if cfg['loss_fn_type'] == 'custom_kd':
#     loss_fn = loss_fn_custom_kd

# "cuda" only when GPUs are available.
device = "cuda" if torch.cuda.is_available() else "cpu"
log(f"device: {device}")

# The number of training epochs and patience.
n_epochs = cfg['n_epochs']
patience = cfg['patience'] # If no improvement in 'patience' epochs, early stop

device: cuda


In [15]:
#https://gitcode.net/mirrors/jiayoujiayoujiayoua/hung-yi-lee-ml-homework/-/blob/master/hw13/hw13_boss.ipynb
# Initialize a model, and put it on the device specified.
student_model.to(device)
teacher_model.to(device) # MEDIUM BASELINE

# Initialize optimizer, you may fine-tune some hyperparameters such as learning rate on your own.
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, student_model.parameters()), lr=cfg['lr'], weight_decay=cfg['weight_decay']) 
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=9, T_mult=2, eta_min=1e-5)

# Initialize trackers, these are not parameters and should not be changed
stale = 0
best_acc = 0.0

teacher_model.eval()  # MEDIUM BASELINE
for epoch in range(n_epochs):

    # ---------- Training ----------
    # Make sure the model is in train mode before training.
    student_model.train()

    # These are used to record information in training.
    train_loss = []
    train_loss_hidden = []
    train_accs = []
    train_lens = []
    p=epoch/(n_epochs-1)
    lamb= p * p # 0-1
    
    for batch in tqdm(train_loader):
        Slayer1out, Slayer2out, Tlayer1out, Tlayer2out = [], [], [], []
        # A batch consists of image data and corresponding labels.
        imgs, labels = batch
        imgs = imgs.to(device)
        labels = labels.to(device)
        #imgs = imgs.half()
        #print(imgs.shape,labels.shape)

        # Forward the data. (Make sure data and model are on the same device.)
        with torch.no_grad():  # MEDIUM BASELINE
            teacher_logits = teacher_model(imgs)  # MEDIUM BASELINE
         
        #print(teacher_logits.shape)  

        
        logits = student_model(imgs)
        #print(logits.shape) 
        slayer1out, slayer2out, tlayer1out, tlayer2out = \
          Slayer1out[0], Slayer2out[0], Tlayer1out[0], Tlayer2out[0]
        # Calculate the cross-entropy loss.
        # We don't need to apply softmax before computing cross-entropy as it is done automatically.
        loss_output = loss_fn_kd(logits, labels, teacher_logits) # MEDIUM BASELINE
        loss_hidden = F.smooth_l1_loss(slayer1out, tlayer1out) + F.smooth_l1_loss(slayer2out, tlayer2out)
        
        loss =  loss_hidden + 10 * lamb * loss_output
        # Gradients stored in the parameters in the previous step should be cleared out first.
        optimizer.zero_grad()

        # Compute the gradients for parameters.
        loss.backward()

        # Clip the gradient norms for stable training.
        grad_norm = nn.utils.clip_grad_norm_(student_model.parameters(), max_norm=cfg['grad_norm_max'])

        # Update the parameters with computed gradients.
        optimizer.step()

        # Compute the accuracy for current batch.
        acc = (logits.argmax(dim=-1) == labels).float().sum()

        # Record the loss and accuracy.
        train_batch_len = len(imgs)
        train_loss.append(loss.item() * train_batch_len)
        train_loss_hidden.append(loss_hidden.item() * train_batch_len)
        train_accs.append(acc)
        train_lens.append(train_batch_len)
        
    train_loss = sum(train_loss) / sum(train_lens)
    train_acc = sum(train_accs) / sum(train_lens)
    train_hidden_loss = sum(train_loss_hidden) / sum(train_lens)

    # Print the information.
    log(f"[ Train | {epoch + 1:03d}/{n_epochs:03d} ] loss = {train_loss:.5f}, hidden_loss = {train_hidden_loss:.5f}, acc = {train_acc:.5f}")

# ---------- Validation ----------
    # Make sure the model is in eval mode so that some modules like dropout are disabled and work normally.
    student_model.eval()

    # These are used to record information in validation.
    valid_loss = []
    valid_accs = []
    valid_lens = []
    # Iterate the validation set by batches.
    for batch in tqdm(valid_loader):

        # A batch consists of image data and corresponding labels.
        imgs, labels = batch
        imgs = imgs.to(device)
        labels = labels.to(device)

        # We don't need gradient in validation.
        # Using torch.no_grad() accelerates the forward process.
        with torch.no_grad():
            logits = student_model(imgs)
            teacher_logits = teacher_model(imgs) # MEDIUM BASELINE

        # We can still compute the loss (but not the gradient).
        loss = loss_fn_kd(logits, labels, teacher_logits) # MEDIUM BASELINE
        # loss = loss_fn(logits, labels) # SIMPLE BASELINE

        # Compute the accuracy for current batch.
        acc = (logits.argmax(dim=-1) == labels).float().sum()

        # Record the loss and accuracy.
        batch_len = len(imgs)
        valid_loss.append(loss.item() * batch_len)
        valid_accs.append(acc)
        valid_lens.append(batch_len)
        #break

    # The average loss and accuracy for entire validation set is the average of the recorded values.
    valid_loss = sum(valid_loss) / sum(valid_lens)
    valid_acc = sum(valid_accs) / sum(valid_lens)

    # update logs
    
    if valid_acc > best_acc:
        log(f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f} -> best")
    else:
        log(f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}")


    # save models
    if valid_acc > best_acc:
        log(f"Best model found at epoch {epoch}, saving model")
        torch.save(student_model.state_dict(), f"{save_path}/student_best.ckpt") # only save best to prevent output memory exceed error
        best_acc = valid_acc
        stale = 0
    else:
        stale += 1
        if stale > patience:
            log(f"No improvment {patience} consecutive epochs, early stopping")
            break
log("Finish training")
log_fw.close()

  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 001/600 ] loss = 1.13837, hidden_loss = 1.13837, acc = 0.06575


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 001/600 ] loss = 33.52517, acc = 0.06859 -> best
Best model found at epoch 0, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 002/600 ] loss = 0.97663, hidden_loss = 0.97608, acc = 0.25438


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 002/600 ] loss = 31.82894, acc = 0.29152 -> best
Best model found at epoch 1, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 003/600 ] loss = 0.87696, hidden_loss = 0.87491, acc = 0.27449


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 003/600 ] loss = 29.59426, acc = 0.27978


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 004/600 ] loss = 0.79940, hidden_loss = 0.79503, acc = 0.28090


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 004/600 ] loss = 28.60998, acc = 0.30302 -> best
Best model found at epoch 3, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 005/600 ] loss = 0.74140, hidden_loss = 0.73401, acc = 0.30591


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 005/600 ] loss = 27.00215, acc = 0.34296 -> best
Best model found at epoch 4, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 006/600 ] loss = 0.69696, hidden_loss = 0.68605, acc = 0.32863


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 006/600 ] loss = 25.74568, acc = 0.36282 -> best
Best model found at epoch 5, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 007/600 ] loss = 0.66189, hidden_loss = 0.64681, acc = 0.35795


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 007/600 ] loss = 24.64451, acc = 0.38087 -> best
Best model found at epoch 6, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 008/600 ] loss = 0.63437, hidden_loss = 0.61475, acc = 0.38637


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 008/600 ] loss = 23.70538, acc = 0.44314 -> best
Best model found at epoch 7, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 009/600 ] loss = 0.61280, hidden_loss = 0.58827, acc = 0.40909


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 009/600 ] loss = 22.60954, acc = 0.46164 -> best
Best model found at epoch 8, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 010/600 ] loss = 0.59596, hidden_loss = 0.56621, acc = 0.42550


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 010/600 ] loss = 21.82358, acc = 0.47766 -> best
Best model found at epoch 9, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 011/600 ] loss = 0.58267, hidden_loss = 0.54708, acc = 0.44531


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 011/600 ] loss = 21.58296, acc = 0.46819


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 012/600 ] loss = 0.57261, hidden_loss = 0.53127, acc = 0.45642


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 012/600 ] loss = 20.67056, acc = 0.49120 -> best
Best model found at epoch 11, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 013/600 ] loss = 0.56563, hidden_loss = 0.51750, acc = 0.47783


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 013/600 ] loss = 20.19387, acc = 0.51670 -> best
Best model found at epoch 12, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 014/600 ] loss = 0.56084, hidden_loss = 0.50645, acc = 0.49305


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 014/600 ] loss = 19.15735, acc = 0.54693 -> best
Best model found at epoch 13, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 015/600 ] loss = 0.55777, hidden_loss = 0.49623, acc = 0.50385


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 015/600 ] loss = 18.86148, acc = 0.55144 -> best
Best model found at epoch 14, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 016/600 ] loss = 0.55613, hidden_loss = 0.48758, acc = 0.50936


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 016/600 ] loss = 18.14802, acc = 0.55460 -> best
Best model found at epoch 15, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 017/600 ] loss = 0.55712, hidden_loss = 0.48058, acc = 0.52046


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 017/600 ] loss = 17.84053, acc = 0.58371 -> best
Best model found at epoch 16, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 018/600 ] loss = 0.55725, hidden_loss = 0.47395, acc = 0.53177


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 018/600 ] loss = 17.70143, acc = 0.58709 -> best
Best model found at epoch 17, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 019/600 ] loss = 0.56010, hidden_loss = 0.46890, acc = 0.53928


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 019/600 ] loss = 17.13815, acc = 0.60086 -> best
Best model found at epoch 18, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 020/600 ] loss = 0.56302, hidden_loss = 0.46433, acc = 0.54548


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 020/600 ] loss = 16.76683, acc = 0.59838


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 021/600 ] loss = 0.56887, hidden_loss = 0.46109, acc = 0.55689


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 021/600 ] loss = 16.69064, acc = 0.58213


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 022/600 ] loss = 0.57413, hidden_loss = 0.45802, acc = 0.56249


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 022/600 ] loss = 16.84218, acc = 0.59273


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 023/600 ] loss = 0.58040, hidden_loss = 0.45594, acc = 0.56930


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 023/600 ] loss = 15.81887, acc = 0.60537 -> best
Best model found at epoch 22, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 024/600 ] loss = 0.58660, hidden_loss = 0.45324, acc = 0.57390


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 024/600 ] loss = 15.36789, acc = 0.62545 -> best
Best model found at epoch 23, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 025/600 ] loss = 0.59577, hidden_loss = 0.45212, acc = 0.57760


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 025/600 ] loss = 15.28116, acc = 0.62996 -> best
Best model found at epoch 24, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 026/600 ] loss = 0.60290, hidden_loss = 0.45065, acc = 0.58661


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 026/600 ] loss = 15.15446, acc = 0.63493 -> best
Best model found at epoch 25, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 027/600 ] loss = 0.61077, hidden_loss = 0.44912, acc = 0.58941


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 027/600 ] loss = 15.14386, acc = 0.63177


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 028/600 ] loss = 0.62051, hidden_loss = 0.44780, acc = 0.59592


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 028/600 ] loss = 14.86795, acc = 0.63741 -> best
Best model found at epoch 27, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 029/600 ] loss = 0.63322, hidden_loss = 0.44841, acc = 0.59932


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 029/600 ] loss = 14.61679, acc = 0.61981


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 030/600 ] loss = 0.64132, hidden_loss = 0.44709, acc = 0.60312


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 030/600 ] loss = 14.21830, acc = 0.64892 -> best
Best model found at epoch 29, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 031/600 ] loss = 0.65374, hidden_loss = 0.44606, acc = 0.60743


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 031/600 ] loss = 14.10101, acc = 0.65523 -> best
Best model found at epoch 30, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 032/600 ] loss = 0.66066, hidden_loss = 0.44583, acc = 0.61983


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 032/600 ] loss = 14.43027, acc = 0.62613


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 033/600 ] loss = 0.67692, hidden_loss = 0.44619, acc = 0.61633


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 033/600 ] loss = 14.01265, acc = 0.65681 -> best
Best model found at epoch 32, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 034/600 ] loss = 0.68723, hidden_loss = 0.44547, acc = 0.61763


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 034/600 ] loss = 13.87421, acc = 0.66764 -> best
Best model found at epoch 33, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 035/600 ] loss = 0.69641, hidden_loss = 0.44506, acc = 0.62404


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 035/600 ] loss = 13.70579, acc = 0.67825 -> best
Best model found at epoch 34, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 036/600 ] loss = 0.71680, hidden_loss = 0.44617, acc = 0.62053


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 036/600 ] loss = 13.33098, acc = 0.68931 -> best
Best model found at epoch 35, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 037/600 ] loss = 0.72554, hidden_loss = 0.44540, acc = 0.63454


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 037/600 ] loss = 13.49708, acc = 0.67171


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 038/600 ] loss = 0.73709, hidden_loss = 0.44563, acc = 0.62974


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 038/600 ] loss = 13.71583, acc = 0.65794


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 039/600 ] loss = 0.74980, hidden_loss = 0.44529, acc = 0.63544


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 039/600 ] loss = 13.21523, acc = 0.69021 -> best
Best model found at epoch 38, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 040/600 ] loss = 0.76806, hidden_loss = 0.44638, acc = 0.64475


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 040/600 ] loss = 13.13139, acc = 0.68931


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 041/600 ] loss = 0.77621, hidden_loss = 0.44534, acc = 0.63785


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 041/600 ] loss = 12.78089, acc = 0.69404 -> best
Best model found at epoch 40, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 042/600 ] loss = 0.79518, hidden_loss = 0.44632, acc = 0.64455


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 042/600 ] loss = 12.72335, acc = 0.70059 -> best
Best model found at epoch 41, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 043/600 ] loss = 0.80908, hidden_loss = 0.44647, acc = 0.63665


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 043/600 ] loss = 13.07782, acc = 0.69088


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 044/600 ] loss = 0.82903, hidden_loss = 0.44794, acc = 0.64605


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 044/600 ] loss = 12.95786, acc = 0.70217 -> best
Best model found at epoch 43, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 045/600 ] loss = 0.83878, hidden_loss = 0.44696, acc = 0.65446


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 045/600 ] loss = 13.23928, acc = 0.65117


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 046/600 ] loss = 0.85844, hidden_loss = 0.44780, acc = 0.65336


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 046/600 ] loss = 12.56382, acc = 0.70465 -> best
Best model found at epoch 45, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 047/600 ] loss = 0.87014, hidden_loss = 0.44774, acc = 0.64965


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 047/600 ] loss = 13.09431, acc = 0.69946


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 048/600 ] loss = 0.89271, hidden_loss = 0.44904, acc = 0.65466


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 048/600 ] loss = 12.21854, acc = 0.72112 -> best
Best model found at epoch 47, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 049/600 ] loss = 0.89969, hidden_loss = 0.44820, acc = 0.65576


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 049/600 ] loss = 12.07430, acc = 0.71977


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 050/600 ] loss = 0.92495, hidden_loss = 0.44942, acc = 0.66376


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 050/600 ] loss = 11.96248, acc = 0.72405 -> best
Best model found at epoch 49, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 051/600 ] loss = 0.93998, hidden_loss = 0.44982, acc = 0.66206


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 051/600 ] loss = 12.10982, acc = 0.71187


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 052/600 ] loss = 0.95504, hidden_loss = 0.44994, acc = 0.65866


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 052/600 ] loss = 12.08684, acc = 0.70420


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 053/600 ] loss = 0.97720, hidden_loss = 0.45053, acc = 0.66787


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 053/600 ] loss = 11.84333, acc = 0.71570


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 054/600 ] loss = 0.99600, hidden_loss = 0.45092, acc = 0.67087


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 054/600 ] loss = 12.40088, acc = 0.71119


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 055/600 ] loss = 1.01021, hidden_loss = 0.45151, acc = 0.66467


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 055/600 ] loss = 11.93028, acc = 0.71954


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 056/600 ] loss = 1.02390, hidden_loss = 0.45188, acc = 0.67177


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 056/600 ] loss = 11.97227, acc = 0.71142


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 057/600 ] loss = 1.05376, hidden_loss = 0.45294, acc = 0.67097


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 057/600 ] loss = 11.82913, acc = 0.72134


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 058/600 ] loss = 1.06623, hidden_loss = 0.45309, acc = 0.66907


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 058/600 ] loss = 11.66950, acc = 0.73285 -> best
Best model found at epoch 57, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 059/600 ] loss = 1.08712, hidden_loss = 0.45297, acc = 0.66917


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 059/600 ] loss = 11.53232, acc = 0.72180


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 060/600 ] loss = 1.10127, hidden_loss = 0.45383, acc = 0.67497


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 060/600 ] loss = 11.75294, acc = 0.69540


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 061/600 ] loss = 1.11785, hidden_loss = 0.45399, acc = 0.67757


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 061/600 ] loss = 11.93217, acc = 0.71074


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 062/600 ] loss = 1.15119, hidden_loss = 0.45555, acc = 0.67517


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 062/600 ] loss = 12.43079, acc = 0.69788


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 063/600 ] loss = 1.17794, hidden_loss = 0.45603, acc = 0.67567


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 063/600 ] loss = 12.36022, acc = 0.72518


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 064/600 ] loss = 1.17981, hidden_loss = 0.45572, acc = 0.67227


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 064/600 ] loss = 11.44423, acc = 0.73105


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 065/600 ] loss = 1.21518, hidden_loss = 0.45704, acc = 0.67657


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 065/600 ] loss = 11.57029, acc = 0.73714 -> best
Best model found at epoch 64, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 066/600 ] loss = 1.23324, hidden_loss = 0.45807, acc = 0.68278


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 066/600 ] loss = 12.12026, acc = 0.70736


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 067/600 ] loss = 1.25063, hidden_loss = 0.45867, acc = 0.68508


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 067/600 ] loss = 11.98699, acc = 0.72811


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 068/600 ] loss = 1.27354, hidden_loss = 0.45932, acc = 0.67667


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 068/600 ] loss = 11.28297, acc = 0.72134


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 069/600 ] loss = 1.29308, hidden_loss = 0.45879, acc = 0.68378


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 069/600 ] loss = 11.45869, acc = 0.71977


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 070/600 ] loss = 1.31183, hidden_loss = 0.46023, acc = 0.67948


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 070/600 ] loss = 11.37937, acc = 0.71616


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 071/600 ] loss = 1.33639, hidden_loss = 0.46038, acc = 0.68718


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 071/600 ] loss = 11.23645, acc = 0.72586


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 072/600 ] loss = 1.36132, hidden_loss = 0.46134, acc = 0.68948


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 072/600 ] loss = 11.19564, acc = 0.74075 -> best
Best model found at epoch 71, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 073/600 ] loss = 1.36868, hidden_loss = 0.46144, acc = 0.69118


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 073/600 ] loss = 11.65736, acc = 0.73014


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 074/600 ] loss = 1.40497, hidden_loss = 0.46203, acc = 0.68528


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 074/600 ] loss = 11.55793, acc = 0.73533


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 075/600 ] loss = 1.42610, hidden_loss = 0.46308, acc = 0.69349


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 075/600 ] loss = 11.40500, acc = 0.73511


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 076/600 ] loss = 1.46114, hidden_loss = 0.46334, acc = 0.68928


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 076/600 ] loss = 10.91482, acc = 0.72586


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 077/600 ] loss = 1.47343, hidden_loss = 0.46403, acc = 0.69439


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 077/600 ] loss = 11.38805, acc = 0.73060


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 078/600 ] loss = 1.49605, hidden_loss = 0.46401, acc = 0.69549


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 078/600 ] loss = 11.03034, acc = 0.74120 -> best
Best model found at epoch 77, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 079/600 ] loss = 1.50852, hidden_loss = 0.46478, acc = 0.69559


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 079/600 ] loss = 11.18681, acc = 0.73308


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 080/600 ] loss = 1.55415, hidden_loss = 0.46528, acc = 0.69309


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 080/600 ] loss = 10.88497, acc = 0.73014


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 081/600 ] loss = 1.58164, hidden_loss = 0.46624, acc = 0.69989


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 081/600 ] loss = 11.09686, acc = 0.72338


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 082/600 ] loss = 1.58463, hidden_loss = 0.46657, acc = 0.70139


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 082/600 ] loss = 10.87152, acc = 0.74075


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 083/600 ] loss = 1.61792, hidden_loss = 0.46786, acc = 0.69148


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 083/600 ] loss = 10.99664, acc = 0.74120


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 084/600 ] loss = 1.63512, hidden_loss = 0.46703, acc = 0.69889


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 084/600 ] loss = 11.86003, acc = 0.71796


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 085/600 ] loss = 1.67836, hidden_loss = 0.46806, acc = 0.69979


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 085/600 ] loss = 10.75037, acc = 0.73646


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 086/600 ] loss = 1.69878, hidden_loss = 0.46972, acc = 0.70269


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 086/600 ] loss = 10.86326, acc = 0.73443


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 087/600 ] loss = 1.72427, hidden_loss = 0.47037, acc = 0.69749


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 087/600 ] loss = 10.51212, acc = 0.74865 -> best
Best model found at epoch 86, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 088/600 ] loss = 1.76504, hidden_loss = 0.47150, acc = 0.70029


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 088/600 ] loss = 10.82653, acc = 0.74752


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 089/600 ] loss = 1.78053, hidden_loss = 0.47126, acc = 0.70439


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 089/600 ] loss = 11.27261, acc = 0.73759


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 090/600 ] loss = 1.81053, hidden_loss = 0.47234, acc = 0.70529


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 090/600 ] loss = 10.97212, acc = 0.74368


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 091/600 ] loss = 1.82957, hidden_loss = 0.47259, acc = 0.70529


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 091/600 ] loss = 10.66035, acc = 0.74368


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 092/600 ] loss = 1.86204, hidden_loss = 0.47283, acc = 0.70429


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 092/600 ] loss = 10.70699, acc = 0.75248 -> best
Best model found at epoch 91, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 093/600 ] loss = 1.89338, hidden_loss = 0.47311, acc = 0.70489


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 093/600 ] loss = 11.13607, acc = 0.74052


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 094/600 ] loss = 1.92117, hidden_loss = 0.47330, acc = 0.70409


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 094/600 ] loss = 10.94383, acc = 0.75023


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 095/600 ] loss = 1.94892, hidden_loss = 0.47528, acc = 0.70539


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 095/600 ] loss = 10.49038, acc = 0.74526


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 096/600 ] loss = 1.98813, hidden_loss = 0.47549, acc = 0.71010


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 096/600 ] loss = 10.43643, acc = 0.75158


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 097/600 ] loss = 1.99337, hidden_loss = 0.47502, acc = 0.70800


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 097/600 ] loss = 10.58118, acc = 0.76399 -> best
Best model found at epoch 96, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 098/600 ] loss = 2.02702, hidden_loss = 0.47668, acc = 0.70599


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 098/600 ] loss = 11.10036, acc = 0.74097


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 099/600 ] loss = 2.05479, hidden_loss = 0.47642, acc = 0.71690


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 099/600 ] loss = 10.43657, acc = 0.75338


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 100/600 ] loss = 2.09233, hidden_loss = 0.47769, acc = 0.71520


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 100/600 ] loss = 10.58772, acc = 0.75474


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 101/600 ] loss = 2.13263, hidden_loss = 0.47769, acc = 0.71240


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 101/600 ] loss = 10.42669, acc = 0.75767


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 102/600 ] loss = 2.14360, hidden_loss = 0.47839, acc = 0.71620


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 102/600 ] loss = 10.57593, acc = 0.75316


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 103/600 ] loss = 2.17420, hidden_loss = 0.47964, acc = 0.71670


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 103/600 ] loss = 10.74244, acc = 0.73985


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 104/600 ] loss = 2.22319, hidden_loss = 0.47974, acc = 0.71070


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 104/600 ] loss = 10.31368, acc = 0.75677


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 105/600 ] loss = 2.23253, hidden_loss = 0.48015, acc = 0.70780


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 105/600 ] loss = 10.39756, acc = 0.74616


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 106/600 ] loss = 2.27436, hidden_loss = 0.48102, acc = 0.71800


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 106/600 ] loss = 10.21632, acc = 0.75699


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 107/600 ] loss = 2.30324, hidden_loss = 0.48087, acc = 0.71310


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 107/600 ] loss = 10.27980, acc = 0.76038


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 108/600 ] loss = 2.32966, hidden_loss = 0.48138, acc = 0.72080


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 108/600 ] loss = 10.64116, acc = 0.76963 -> best
Best model found at epoch 107, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 109/600 ] loss = 2.38236, hidden_loss = 0.48239, acc = 0.71390


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 109/600 ] loss = 10.29686, acc = 0.76828


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 110/600 ] loss = 2.38288, hidden_loss = 0.48315, acc = 0.71660


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 110/600 ] loss = 10.41562, acc = 0.74458


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 111/600 ] loss = 2.42181, hidden_loss = 0.48325, acc = 0.71230


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 111/600 ] loss = 10.10636, acc = 0.75451


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 112/600 ] loss = 2.46220, hidden_loss = 0.48469, acc = 0.71700


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 112/600 ] loss = 10.19383, acc = 0.76173


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 113/600 ] loss = 2.45523, hidden_loss = 0.48448, acc = 0.71320


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 113/600 ] loss = 10.22624, acc = 0.76196


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 114/600 ] loss = 2.51446, hidden_loss = 0.48584, acc = 0.71960


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 114/600 ] loss = 10.78925, acc = 0.75654


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 115/600 ] loss = 2.54746, hidden_loss = 0.48570, acc = 0.72311


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 115/600 ] loss = 10.10047, acc = 0.76106


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 116/600 ] loss = 2.60897, hidden_loss = 0.48672, acc = 0.71560


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 116/600 ] loss = 10.16341, acc = 0.74977


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 117/600 ] loss = 2.59845, hidden_loss = 0.48650, acc = 0.72201


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 117/600 ] loss = 10.46053, acc = 0.75564


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 118/600 ] loss = 2.66881, hidden_loss = 0.48791, acc = 0.72191


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 118/600 ] loss = 10.41839, acc = 0.75948


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 119/600 ] loss = 2.70825, hidden_loss = 0.48827, acc = 0.71960


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 119/600 ] loss = 10.20022, acc = 0.75925


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 120/600 ] loss = 2.72778, hidden_loss = 0.48891, acc = 0.72221


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 120/600 ] loss = 9.87043, acc = 0.76692


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 121/600 ] loss = 2.76353, hidden_loss = 0.48957, acc = 0.72070


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 121/600 ] loss = 10.25984, acc = 0.76309


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 122/600 ] loss = 2.81719, hidden_loss = 0.49096, acc = 0.72090


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 122/600 ] loss = 10.05237, acc = 0.76083


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 123/600 ] loss = 2.84176, hidden_loss = 0.49127, acc = 0.72331


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 123/600 ] loss = 10.17553, acc = 0.76173


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 124/600 ] loss = 2.83435, hidden_loss = 0.49151, acc = 0.72611


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 124/600 ] loss = 9.98795, acc = 0.77617 -> best
Best model found at epoch 123, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 125/600 ] loss = 2.90908, hidden_loss = 0.49207, acc = 0.72751


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 125/600 ] loss = 9.93772, acc = 0.76241


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 126/600 ] loss = 2.92590, hidden_loss = 0.49256, acc = 0.72731


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 126/600 ] loss = 10.20957, acc = 0.76986


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 127/600 ] loss = 2.94965, hidden_loss = 0.49405, acc = 0.72341


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 127/600 ] loss = 10.17724, acc = 0.76918


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 128/600 ] loss = 2.99114, hidden_loss = 0.49496, acc = 0.73041


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 128/600 ] loss = 10.04188, acc = 0.77234


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 129/600 ] loss = 2.99789, hidden_loss = 0.49365, acc = 0.72501


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 129/600 ] loss = 10.34242, acc = 0.74662


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 130/600 ] loss = 3.05607, hidden_loss = 0.49481, acc = 0.72901


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 130/600 ] loss = 10.09039, acc = 0.77008


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 131/600 ] loss = 3.13307, hidden_loss = 0.49590, acc = 0.72741


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 131/600 ] loss = 10.08439, acc = 0.76399


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 132/600 ] loss = 3.13866, hidden_loss = 0.49669, acc = 0.72861


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 132/600 ] loss = 10.39264, acc = 0.74910


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 133/600 ] loss = 3.22041, hidden_loss = 0.49739, acc = 0.73531


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 133/600 ] loss = 10.07812, acc = 0.76038


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 134/600 ] loss = 3.23416, hidden_loss = 0.49820, acc = 0.72781


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 134/600 ] loss = 10.04200, acc = 0.76106


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 135/600 ] loss = 3.26300, hidden_loss = 0.49782, acc = 0.72551


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 135/600 ] loss = 10.06173, acc = 0.76421


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 136/600 ] loss = 3.31733, hidden_loss = 0.49870, acc = 0.73672


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 136/600 ] loss = 9.80605, acc = 0.76986


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 137/600 ] loss = 3.31012, hidden_loss = 0.49914, acc = 0.73091


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 137/600 ] loss = 9.79178, acc = 0.77324


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 138/600 ] loss = 3.35638, hidden_loss = 0.50037, acc = 0.73862


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 138/600 ] loss = 9.81129, acc = 0.77031


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 139/600 ] loss = 3.40880, hidden_loss = 0.50137, acc = 0.72741


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 139/600 ] loss = 10.26988, acc = 0.77708 -> best
Best model found at epoch 138, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 140/600 ] loss = 3.46557, hidden_loss = 0.50258, acc = 0.72861


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 140/600 ] loss = 10.09178, acc = 0.77392


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 141/600 ] loss = 3.47489, hidden_loss = 0.50216, acc = 0.72701


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 141/600 ] loss = 9.97729, acc = 0.76534


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 142/600 ] loss = 3.53216, hidden_loss = 0.50385, acc = 0.72651


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 142/600 ] loss = 9.80574, acc = 0.76038


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 143/600 ] loss = 3.57651, hidden_loss = 0.50398, acc = 0.72671


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 143/600 ] loss = 9.74722, acc = 0.76737


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 144/600 ] loss = 3.60302, hidden_loss = 0.50379, acc = 0.73061


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 144/600 ] loss = 10.02677, acc = 0.76467


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 145/600 ] loss = 3.63502, hidden_loss = 0.50473, acc = 0.72961


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 145/600 ] loss = 10.13704, acc = 0.77098


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 146/600 ] loss = 3.69067, hidden_loss = 0.50564, acc = 0.72791


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 146/600 ] loss = 9.95404, acc = 0.77640


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 147/600 ] loss = 3.73063, hidden_loss = 0.50695, acc = 0.72831


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 147/600 ] loss = 9.94126, acc = 0.77166


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 148/600 ] loss = 3.74974, hidden_loss = 0.50673, acc = 0.73221


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 148/600 ] loss = 10.09168, acc = 0.77753 -> best
Best model found at epoch 147, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 149/600 ] loss = 3.83429, hidden_loss = 0.50758, acc = 0.73191


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 149/600 ] loss = 9.86228, acc = 0.76015


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 150/600 ] loss = 3.81272, hidden_loss = 0.50831, acc = 0.73161


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 150/600 ] loss = 9.83356, acc = 0.76828


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 151/600 ] loss = 3.88493, hidden_loss = 0.50967, acc = 0.72971


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 151/600 ] loss = 9.96057, acc = 0.76782


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 152/600 ] loss = 3.88795, hidden_loss = 0.50962, acc = 0.73602


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 152/600 ] loss = 9.79506, acc = 0.77053


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 153/600 ] loss = 3.94797, hidden_loss = 0.51052, acc = 0.73632


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 153/600 ] loss = 10.07003, acc = 0.76918


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 154/600 ] loss = 4.01320, hidden_loss = 0.51111, acc = 0.73832


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 154/600 ] loss = 9.54182, acc = 0.78023 -> best
Best model found at epoch 153, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 155/600 ] loss = 4.02682, hidden_loss = 0.51141, acc = 0.73161


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 155/600 ] loss = 9.69754, acc = 0.77459


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 156/600 ] loss = 4.08287, hidden_loss = 0.51287, acc = 0.73912


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 156/600 ] loss = 9.62436, acc = 0.77911


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 157/600 ] loss = 4.10051, hidden_loss = 0.51225, acc = 0.73261


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 157/600 ] loss = 9.55684, acc = 0.77098


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 158/600 ] loss = 4.18922, hidden_loss = 0.51375, acc = 0.73662


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 158/600 ] loss = 9.58706, acc = 0.77708


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 159/600 ] loss = 4.20850, hidden_loss = 0.51329, acc = 0.74082


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 159/600 ] loss = 9.75959, acc = 0.78384 -> best
Best model found at epoch 158, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 160/600 ] loss = 4.24702, hidden_loss = 0.51390, acc = 0.73281


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 160/600 ] loss = 9.50580, acc = 0.78475 -> best
Best model found at epoch 159, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 161/600 ] loss = 4.28069, hidden_loss = 0.51438, acc = 0.73762


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 161/600 ] loss = 9.35612, acc = 0.77369


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 162/600 ] loss = 4.33390, hidden_loss = 0.51506, acc = 0.73331


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 162/600 ] loss = 9.73665, acc = 0.78294


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 163/600 ] loss = 4.37597, hidden_loss = 0.51585, acc = 0.73341


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 163/600 ] loss = 9.63928, acc = 0.78994 -> best
Best model found at epoch 162, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 164/600 ] loss = 4.41058, hidden_loss = 0.51646, acc = 0.74142


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 164/600 ] loss = 9.51937, acc = 0.77482


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 165/600 ] loss = 4.49758, hidden_loss = 0.51720, acc = 0.74032


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 165/600 ] loss = 9.77337, acc = 0.77189


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 166/600 ] loss = 4.53108, hidden_loss = 0.51764, acc = 0.74082


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 166/600 ] loss = 9.59417, acc = 0.77437


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 167/600 ] loss = 4.55772, hidden_loss = 0.51881, acc = 0.73662


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 167/600 ] loss = 9.68804, acc = 0.77550


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 168/600 ] loss = 4.58318, hidden_loss = 0.51963, acc = 0.73952


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 168/600 ] loss = 9.85405, acc = 0.76940


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 169/600 ] loss = 4.67752, hidden_loss = 0.52027, acc = 0.74422


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 169/600 ] loss = 9.68126, acc = 0.77324


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 170/600 ] loss = 4.75415, hidden_loss = 0.52048, acc = 0.73972


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 170/600 ] loss = 9.66323, acc = 0.77505


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 171/600 ] loss = 4.79761, hidden_loss = 0.52222, acc = 0.73501


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 171/600 ] loss = 9.54853, acc = 0.77933


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 172/600 ] loss = 4.74976, hidden_loss = 0.52232, acc = 0.73561


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 172/600 ] loss = 9.46431, acc = 0.78903


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 173/600 ] loss = 4.87515, hidden_loss = 0.52365, acc = 0.73812


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 173/600 ] loss = 9.62045, acc = 0.77505


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 174/600 ] loss = 4.86333, hidden_loss = 0.52391, acc = 0.74002


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 174/600 ] loss = 9.83209, acc = 0.78384


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 175/600 ] loss = 4.91327, hidden_loss = 0.52483, acc = 0.74102


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 175/600 ] loss = 9.64366, acc = 0.78339


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 176/600 ] loss = 4.98371, hidden_loss = 0.52558, acc = 0.74232


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 176/600 ] loss = 9.46400, acc = 0.77866


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 177/600 ] loss = 5.04858, hidden_loss = 0.52612, acc = 0.74212


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 177/600 ] loss = 9.43830, acc = 0.78430


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 178/600 ] loss = 5.06714, hidden_loss = 0.52649, acc = 0.73622


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 178/600 ] loss = 9.55133, acc = 0.78339


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 179/600 ] loss = 5.15225, hidden_loss = 0.52717, acc = 0.74412


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 179/600 ] loss = 9.53247, acc = 0.77662


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 180/600 ] loss = 5.19904, hidden_loss = 0.52657, acc = 0.74052


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 180/600 ] loss = 9.52821, acc = 0.77256


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 181/600 ] loss = 5.21235, hidden_loss = 0.52832, acc = 0.74062


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 181/600 ] loss = 9.42500, acc = 0.77708


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 182/600 ] loss = 5.27375, hidden_loss = 0.52863, acc = 0.74572


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 182/600 ] loss = 9.96924, acc = 0.77369


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 183/600 ] loss = 5.32214, hidden_loss = 0.52946, acc = 0.73972


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 183/600 ] loss = 9.39906, acc = 0.77933


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 184/600 ] loss = 5.27530, hidden_loss = 0.53093, acc = 0.74522


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 184/600 ] loss = 9.39500, acc = 0.77798


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 185/600 ] loss = 5.40545, hidden_loss = 0.53132, acc = 0.74592


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 185/600 ] loss = 9.40844, acc = 0.78317


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 186/600 ] loss = 5.47200, hidden_loss = 0.53075, acc = 0.74452


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 186/600 ] loss = 9.69589, acc = 0.78384


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 187/600 ] loss = 5.47874, hidden_loss = 0.53145, acc = 0.73872


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 187/600 ] loss = 9.45313, acc = 0.79106 -> best
Best model found at epoch 186, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 188/600 ] loss = 5.59754, hidden_loss = 0.53237, acc = 0.73932


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 188/600 ] loss = 9.65773, acc = 0.77279


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 189/600 ] loss = 5.61866, hidden_loss = 0.53307, acc = 0.73782


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 189/600 ] loss = 9.39598, acc = 0.77301


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 190/600 ] loss = 5.66054, hidden_loss = 0.53456, acc = 0.74052


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 190/600 ] loss = 9.35865, acc = 0.78181


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 191/600 ] loss = 5.72986, hidden_loss = 0.53511, acc = 0.74612


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 191/600 ] loss = 9.37097, acc = 0.78023


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 192/600 ] loss = 5.75180, hidden_loss = 0.53554, acc = 0.74112


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 192/600 ] loss = 9.55207, acc = 0.78249


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 193/600 ] loss = 5.81267, hidden_loss = 0.53660, acc = 0.74902


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 193/600 ] loss = 9.26594, acc = 0.78362


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 194/600 ] loss = 5.82916, hidden_loss = 0.53599, acc = 0.73762


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 194/600 ] loss = 9.50410, acc = 0.78407


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 195/600 ] loss = 5.86951, hidden_loss = 0.53729, acc = 0.74182


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 195/600 ] loss = 9.42803, acc = 0.79197 -> best
Best model found at epoch 194, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 196/600 ] loss = 5.96815, hidden_loss = 0.53821, acc = 0.74822


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 196/600 ] loss = 9.37440, acc = 0.79129


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 197/600 ] loss = 5.98736, hidden_loss = 0.53850, acc = 0.74492


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 197/600 ] loss = 9.38380, acc = 0.77662


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 198/600 ] loss = 6.12242, hidden_loss = 0.53925, acc = 0.74282


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 198/600 ] loss = 9.37249, acc = 0.77978


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 199/600 ] loss = 6.03545, hidden_loss = 0.53853, acc = 0.74872


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 199/600 ] loss = 9.35053, acc = 0.78542


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 200/600 ] loss = 6.22773, hidden_loss = 0.54166, acc = 0.74902


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 200/600 ] loss = 9.21882, acc = 0.78407


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 201/600 ] loss = 6.16650, hidden_loss = 0.54029, acc = 0.74992


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 201/600 ] loss = 9.50894, acc = 0.78565


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 202/600 ] loss = 6.27383, hidden_loss = 0.54232, acc = 0.75203


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 202/600 ] loss = 9.29724, acc = 0.78700


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 203/600 ] loss = 6.31815, hidden_loss = 0.54244, acc = 0.74822


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 203/600 ] loss = 9.36776, acc = 0.79400 -> best
Best model found at epoch 202, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 204/600 ] loss = 6.33428, hidden_loss = 0.54267, acc = 0.74882


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 204/600 ] loss = 9.26709, acc = 0.78633


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 205/600 ] loss = 6.43122, hidden_loss = 0.54329, acc = 0.75163


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 205/600 ] loss = 10.01064, acc = 0.78294


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 206/600 ] loss = 6.50150, hidden_loss = 0.54438, acc = 0.74332


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 206/600 ] loss = 9.24081, acc = 0.78588


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 207/600 ] loss = 6.54211, hidden_loss = 0.54494, acc = 0.74332


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 207/600 ] loss = 9.14755, acc = 0.78678


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 208/600 ] loss = 6.58260, hidden_loss = 0.54516, acc = 0.75383


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 208/600 ] loss = 9.43179, acc = 0.78768


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 209/600 ] loss = 6.65056, hidden_loss = 0.54574, acc = 0.75253


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 209/600 ] loss = 9.40607, acc = 0.77482


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 210/600 ] loss = 6.61899, hidden_loss = 0.54613, acc = 0.74992


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 210/600 ] loss = 9.13456, acc = 0.78430


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 211/600 ] loss = 6.71870, hidden_loss = 0.54667, acc = 0.75573


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 211/600 ] loss = 9.23045, acc = 0.78069


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 212/600 ] loss = 6.80122, hidden_loss = 0.54685, acc = 0.74652


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 212/600 ] loss = 9.67870, acc = 0.77144


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 213/600 ] loss = 6.82631, hidden_loss = 0.54715, acc = 0.74922


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 213/600 ] loss = 9.29412, acc = 0.79174


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 214/600 ] loss = 6.86142, hidden_loss = 0.54815, acc = 0.74662


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 214/600 ] loss = 9.21590, acc = 0.77978


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 215/600 ] loss = 6.93060, hidden_loss = 0.54963, acc = 0.75163


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 215/600 ] loss = 9.41406, acc = 0.78294


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 216/600 ] loss = 6.95724, hidden_loss = 0.54925, acc = 0.75123


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 216/600 ] loss = 9.24628, acc = 0.77866


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 217/600 ] loss = 7.05006, hidden_loss = 0.55037, acc = 0.75243


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 217/600 ] loss = 9.57165, acc = 0.76376


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 218/600 ] loss = 7.07068, hidden_loss = 0.55008, acc = 0.74422


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 218/600 ] loss = 9.43194, acc = 0.78384


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 219/600 ] loss = 7.11670, hidden_loss = 0.55213, acc = 0.75613


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 219/600 ] loss = 9.55061, acc = 0.77978


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 220/600 ] loss = 7.23553, hidden_loss = 0.55220, acc = 0.74892


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 220/600 ] loss = 9.40626, acc = 0.78249


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 221/600 ] loss = 7.29082, hidden_loss = 0.55282, acc = 0.74602


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 221/600 ] loss = 9.44107, acc = 0.78520


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 222/600 ] loss = 7.44603, hidden_loss = 0.55354, acc = 0.75803


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 222/600 ] loss = 9.13907, acc = 0.79039


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 223/600 ] loss = 7.41298, hidden_loss = 0.55402, acc = 0.74822


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 223/600 ] loss = 9.25313, acc = 0.77956


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 224/600 ] loss = 7.46099, hidden_loss = 0.55421, acc = 0.74992


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 224/600 ] loss = 9.06521, acc = 0.78542


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 225/600 ] loss = 7.44263, hidden_loss = 0.55490, acc = 0.75073


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 225/600 ] loss = 9.19702, acc = 0.78497


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 226/600 ] loss = 7.56318, hidden_loss = 0.55518, acc = 0.75593


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 226/600 ] loss = 9.23398, acc = 0.78542


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 227/600 ] loss = 7.64311, hidden_loss = 0.55572, acc = 0.75023


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 227/600 ] loss = 8.99970, acc = 0.78542


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 228/600 ] loss = 7.66828, hidden_loss = 0.55612, acc = 0.75563


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 228/600 ] loss = 9.11544, acc = 0.79219


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 229/600 ] loss = 7.68484, hidden_loss = 0.55888, acc = 0.75253


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 229/600 ] loss = 9.38410, acc = 0.77843


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 230/600 ] loss = 7.77326, hidden_loss = 0.55826, acc = 0.75493


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 230/600 ] loss = 9.05857, acc = 0.78497


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 231/600 ] loss = 7.83801, hidden_loss = 0.55781, acc = 0.75053


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 231/600 ] loss = 9.10064, acc = 0.79377


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 232/600 ] loss = 7.89352, hidden_loss = 0.55812, acc = 0.75533


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 232/600 ] loss = 9.19468, acc = 0.79671 -> best
Best model found at epoch 231, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 233/600 ] loss = 7.90030, hidden_loss = 0.55766, acc = 0.75043


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 233/600 ] loss = 9.06531, acc = 0.78407


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 234/600 ] loss = 7.96005, hidden_loss = 0.55887, acc = 0.75473


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 234/600 ] loss = 9.15970, acc = 0.79445


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 235/600 ] loss = 8.01685, hidden_loss = 0.56061, acc = 0.74752


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 235/600 ] loss = 9.11279, acc = 0.79580


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 236/600 ] loss = 8.12680, hidden_loss = 0.56183, acc = 0.74802


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 236/600 ] loss = 9.12545, acc = 0.79896 -> best
Best model found at epoch 235, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 237/600 ] loss = 8.25830, hidden_loss = 0.56160, acc = 0.75813


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 237/600 ] loss = 9.41254, acc = 0.77911


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 238/600 ] loss = 8.19529, hidden_loss = 0.56229, acc = 0.75343


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 238/600 ] loss = 9.05956, acc = 0.79806


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 239/600 ] loss = 8.33270, hidden_loss = 0.56294, acc = 0.75293


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 239/600 ] loss = 9.24224, acc = 0.77685


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 240/600 ] loss = 8.31221, hidden_loss = 0.56264, acc = 0.75513


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 240/600 ] loss = 8.87621, acc = 0.79106


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 241/600 ] loss = 8.42410, hidden_loss = 0.56497, acc = 0.75723


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 241/600 ] loss = 9.36489, acc = 0.79558


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 242/600 ] loss = 8.56852, hidden_loss = 0.56460, acc = 0.75873


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 242/600 ] loss = 9.08113, acc = 0.79377


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 243/600 ] loss = 8.61923, hidden_loss = 0.56528, acc = 0.75643


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 243/600 ] loss = 9.37040, acc = 0.78023


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 244/600 ] loss = 8.55146, hidden_loss = 0.56422, acc = 0.74962


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 244/600 ] loss = 9.11317, acc = 0.78520


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 245/600 ] loss = 8.66405, hidden_loss = 0.56602, acc = 0.75723


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 245/600 ] loss = 8.94854, acc = 0.79061


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 246/600 ] loss = 8.58132, hidden_loss = 0.56599, acc = 0.75173


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 246/600 ] loss = 9.10026, acc = 0.78339


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 247/600 ] loss = 8.80729, hidden_loss = 0.56690, acc = 0.76073


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 247/600 ] loss = 9.39141, acc = 0.77098


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 248/600 ] loss = 9.02300, hidden_loss = 0.56683, acc = 0.75173


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 248/600 ] loss = 9.08885, acc = 0.78633


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 249/600 ] loss = 8.83764, hidden_loss = 0.56849, acc = 0.75493


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 249/600 ] loss = 9.13100, acc = 0.78565


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 250/600 ] loss = 8.91599, hidden_loss = 0.56939, acc = 0.76003


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 250/600 ] loss = 9.31728, acc = 0.77933


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 251/600 ] loss = 8.96531, hidden_loss = 0.56951, acc = 0.75053


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 251/600 ] loss = 9.14951, acc = 0.79197


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 252/600 ] loss = 9.09427, hidden_loss = 0.56916, acc = 0.75533


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 252/600 ] loss = 9.16186, acc = 0.78475


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 253/600 ] loss = 9.17539, hidden_loss = 0.56968, acc = 0.75433


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 253/600 ] loss = 9.19721, acc = 0.77662


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 254/600 ] loss = 9.30133, hidden_loss = 0.57039, acc = 0.75773


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 254/600 ] loss = 9.00743, acc = 0.80551 -> best
Best model found at epoch 253, saving model


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 255/600 ] loss = 9.15113, hidden_loss = 0.57169, acc = 0.75743


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 255/600 ] loss = 9.18734, acc = 0.79129


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 256/600 ] loss = 9.25054, hidden_loss = 0.57212, acc = 0.75393


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 256/600 ] loss = 9.11279, acc = 0.78384


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 257/600 ] loss = 9.30632, hidden_loss = 0.57188, acc = 0.75703


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 257/600 ] loss = 9.15947, acc = 0.78836


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 258/600 ] loss = 9.55298, hidden_loss = 0.57237, acc = 0.75853


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 258/600 ] loss = 9.07215, acc = 0.79152


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 259/600 ] loss = 9.56031, hidden_loss = 0.57272, acc = 0.75453


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 259/600 ] loss = 9.24953, acc = 0.78813


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 260/600 ] loss = 9.70545, hidden_loss = 0.57451, acc = 0.75643


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 260/600 ] loss = 9.25311, acc = 0.79355


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 261/600 ] loss = 9.68570, hidden_loss = 0.57445, acc = 0.75323


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 261/600 ] loss = 8.92575, acc = 0.77956


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 262/600 ] loss = 9.69038, hidden_loss = 0.57603, acc = 0.76143


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 262/600 ] loss = 8.92006, acc = 0.78994


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 263/600 ] loss = 9.70575, hidden_loss = 0.57579, acc = 0.76333


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 263/600 ] loss = 9.01317, acc = 0.79377


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 264/600 ] loss = 9.76700, hidden_loss = 0.57597, acc = 0.75553


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 264/600 ] loss = 8.93724, acc = 0.79242


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 265/600 ] loss = 9.92038, hidden_loss = 0.57642, acc = 0.75963


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 265/600 ] loss = 9.21036, acc = 0.80257


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 266/600 ] loss = 10.17941, hidden_loss = 0.57655, acc = 0.75313


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 266/600 ] loss = 8.87275, acc = 0.79445


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 267/600 ] loss = 10.04519, hidden_loss = 0.57733, acc = 0.76353


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 267/600 ] loss = 9.10117, acc = 0.79806


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 268/600 ] loss = 10.12054, hidden_loss = 0.57710, acc = 0.75713


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 268/600 ] loss = 8.87321, acc = 0.79648


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 269/600 ] loss = 10.17488, hidden_loss = 0.57755, acc = 0.75423


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 269/600 ] loss = 9.11805, acc = 0.78903


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 270/600 ] loss = 10.23203, hidden_loss = 0.57910, acc = 0.76213


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 270/600 ] loss = 9.07485, acc = 0.78384


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 271/600 ] loss = 10.42224, hidden_loss = 0.57928, acc = 0.76073


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 271/600 ] loss = 9.12222, acc = 0.78633


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 272/600 ] loss = 10.38281, hidden_loss = 0.57976, acc = 0.75993


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 272/600 ] loss = 9.38858, acc = 0.79603


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 273/600 ] loss = 10.41125, hidden_loss = 0.58062, acc = 0.75633


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 273/600 ] loss = 9.17470, acc = 0.79377


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 274/600 ] loss = 10.40737, hidden_loss = 0.58060, acc = 0.75973


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 274/600 ] loss = 9.09354, acc = 0.78903


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 275/600 ] loss = 10.48828, hidden_loss = 0.58175, acc = 0.76023


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 275/600 ] loss = 8.97189, acc = 0.79084


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 276/600 ] loss = 10.66007, hidden_loss = 0.58135, acc = 0.76824


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 276/600 ] loss = 8.93700, acc = 0.78926


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 277/600 ] loss = 10.66712, hidden_loss = 0.58280, acc = 0.75823


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 277/600 ] loss = 9.38480, acc = 0.77459


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 278/600 ] loss = 10.67940, hidden_loss = 0.58341, acc = 0.76383


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 278/600 ] loss = 8.84637, acc = 0.79287


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 279/600 ] loss = 10.70448, hidden_loss = 0.58448, acc = 0.75953


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 279/600 ] loss = 9.17388, acc = 0.79896


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 280/600 ] loss = 10.96352, hidden_loss = 0.58432, acc = 0.76263


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 280/600 ] loss = 8.84575, acc = 0.79084


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 281/600 ] loss = 11.01282, hidden_loss = 0.58431, acc = 0.76243


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 281/600 ] loss = 9.06683, acc = 0.78430


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 282/600 ] loss = 10.87295, hidden_loss = 0.58463, acc = 0.76554


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 282/600 ] loss = 8.90489, acc = 0.78633


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 283/600 ] loss = 10.98501, hidden_loss = 0.58532, acc = 0.75723


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 283/600 ] loss = 9.08001, acc = 0.78497


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 284/600 ] loss = 11.15290, hidden_loss = 0.58612, acc = 0.76273


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 284/600 ] loss = 8.91640, acc = 0.79332


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 285/600 ] loss = 11.09931, hidden_loss = 0.58718, acc = 0.76454


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 285/600 ] loss = 8.86427, acc = 0.79332


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 286/600 ] loss = 11.17073, hidden_loss = 0.58702, acc = 0.76223


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 286/600 ] loss = 9.02252, acc = 0.79671


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 287/600 ] loss = 11.43958, hidden_loss = 0.58799, acc = 0.75863


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 287/600 ] loss = 8.89473, acc = 0.78813


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 288/600 ] loss = 11.50065, hidden_loss = 0.58846, acc = 0.76484


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 288/600 ] loss = 8.90563, acc = 0.79422


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 289/600 ] loss = 11.49701, hidden_loss = 0.58836, acc = 0.76393


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 289/600 ] loss = 9.09919, acc = 0.78610


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 290/600 ] loss = 11.43132, hidden_loss = 0.58861, acc = 0.76544


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 290/600 ] loss = 8.85144, acc = 0.79355


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 291/600 ] loss = 11.62414, hidden_loss = 0.58902, acc = 0.76383


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 291/600 ] loss = 8.95069, acc = 0.79558


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 292/600 ] loss = 11.81870, hidden_loss = 0.58992, acc = 0.76193


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 292/600 ] loss = 9.09442, acc = 0.78723


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 293/600 ] loss = 11.81390, hidden_loss = 0.59215, acc = 0.76213


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 293/600 ] loss = 8.86730, acc = 0.80054


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 294/600 ] loss = 12.03630, hidden_loss = 0.59171, acc = 0.75793


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 294/600 ] loss = 8.89533, acc = 0.78272


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 295/600 ] loss = 11.73457, hidden_loss = 0.59088, acc = 0.76013


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 295/600 ] loss = 9.11976, acc = 0.78069


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 296/600 ] loss = 11.85544, hidden_loss = 0.59181, acc = 0.76664


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 296/600 ] loss = 8.83832, acc = 0.80190


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 297/600 ] loss = 12.07268, hidden_loss = 0.59247, acc = 0.76343


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 297/600 ] loss = 8.78775, acc = 0.78926


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 298/600 ] loss = 12.16074, hidden_loss = 0.59359, acc = 0.76373


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 298/600 ] loss = 8.99979, acc = 0.79016


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 299/600 ] loss = 12.30393, hidden_loss = 0.59412, acc = 0.76313


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 299/600 ] loss = 8.93506, acc = 0.77775


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 300/600 ] loss = 12.23920, hidden_loss = 0.59407, acc = 0.76023


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 300/600 ] loss = 9.00770, acc = 0.79355


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 301/600 ] loss = 12.33533, hidden_loss = 0.59455, acc = 0.76293


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 301/600 ] loss = 8.81560, acc = 0.80054


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 302/600 ] loss = 12.38296, hidden_loss = 0.59478, acc = 0.76103


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 302/600 ] loss = 9.04992, acc = 0.79016


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 303/600 ] loss = 12.64750, hidden_loss = 0.59483, acc = 0.76093


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 303/600 ] loss = 8.93113, acc = 0.78723


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 304/600 ] loss = 12.55823, hidden_loss = 0.59595, acc = 0.76343


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 304/600 ] loss = 8.80679, acc = 0.79039


  0%|          | 0/157 [00:00<?, ?it/s]

[ Train | 305/600 ] loss = 12.53923, hidden_loss = 0.59587, acc = 0.75973


  0%|          | 0/70 [00:00<?, ?it/s]

[ Valid | 305/600 ] loss = 8.70591, acc = 0.79964
No improvment 50 consecutive epochs, early stopping
Finish training


In [16]:
# create dataloader for evaluation
eval_set = FoodDataset(os.path.join(cfg['dataset_root'], "evaluation"), tfm=test_tfm)
eval_loader = DataLoader(eval_set, batch_size=cfg['batch_size'], shuffle=False, num_workers=0, pin_memory=True)

One /kaggle/input/ml2023spring-hw13/Food-11/evaluation sample /kaggle/input/ml2023spring-hw13/Food-11/evaluation/0000.jpg


In [17]:
# Load model from {exp_name}/student_best.ckpt
student_model_best = get_student_model() # get a new student model to avoid reference before assignment.
ckpt_path = f"{save_path}/student_best.ckpt" # the ckpt path of the best student model.
student_model_best.load_state_dict(torch.load(ckpt_path, map_location='cpu')) # load the state dict and set it to the student model
student_model_best.to(device) # set the student model to device

# Start evaluate
student_model_best.eval()
eval_preds = [] # storing predictions of the evaluation dataset

# Iterate the validation set by batches.
for batch in tqdm(eval_loader):
    # A batch consists of image data and corresponding labels.
    imgs, _ = batch
    # We don't need gradient in evaluation.
    # Using torch.no_grad() accelerates the forward process.
    with torch.no_grad():
        logits = student_model_best(imgs.to(device))
        preds = list(logits.argmax(dim=-1).squeeze().cpu().numpy())
    # loss and acc can not be calculated because we do not have the true labels of the evaluation set.
    eval_preds += preds

def pad4(i):
    return "0"*(4-len(str(i))) + str(i)

# Save prediction results
ids = [pad4(i) for i in range(0,len(eval_set))]
categories = eval_preds

df = pd.DataFrame()
df['Id'] = ids
df['Category'] = categories
df.to_csv(f"{save_path}/submission.csv", index=False) # now you can download the submission.csv and upload it to the kaggle competition.

  0%|          | 0/35 [00:00<?, ?it/s]