In [1]:
!pip install torchsummary
# Import some useful packages for this homework
import numpy as np
import pandas as pd
import torch
import os
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import ConcatDataset, DataLoader, Subset, Dataset # "ConcatDataset" and "Subset" are possibly useful
from torchvision.datasets import DatasetFolder, VisionDataset
from torchsummary import summary
from tqdm.auto import tqdm
import random

# !nvidia-smi # list your current GPU

Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1


In [2]:
# Experiment configuration shared by the cells below.
cfg = dict(
    dataset_root='../input/ml2022spring-hw13/food11-hw13',
    save_dir='./outputs',
    exp_name="strong_baseline",
    batch_size=128,
    lr=1e-3,
    seed=20220013,
    # simple baseline: CE, medium baseline: KD. See the Knowledge_Distillation part for more information.
    loss_fn_type='KD',
    weight_decay=1e-5,
    grad_norm_max=10,
    # train more steps to pass the medium baseline.
    n_epochs=200,
    patience=40,
)

In [3]:
# --- Reproducibility: seed every RNG used here and make cuDNN deterministic ---
myseed = cfg['seed']
random.seed(myseed)
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# --- Output directory of this experiment: <save_dir>/<exp_name> ---
save_path = os.path.join(cfg['save_dir'], cfg['exp_name'])
os.makedirs(save_path, exist_ok=True)

# --- Simple logging: every message goes to stdout and to <save_path>/log.txt ---
# The file is opened in 'w' mode, so an existing log at this path is overwritten.
log_fw = open(f"{save_path}/log.txt", 'w')


def log(text):
    """Print ``text`` and write its string form plus a newline to the log file, flushing at once."""
    print(text)
    print(text, file=log_fw, flush=True)


log(cfg)  # record the configuration at the top of the log

{'dataset_root': '../input/ml2022spring-hw13/food11-hw13', 'save_dir': './outputs', 'exp_name': 'strong_baseline', 'batch_size': 128, 'lr': 0.001, 'seed': 20220013, 'loss_fn_type': 'KD', 'weight_decay': 1e-05, 'grad_norm_max': 10, 'n_epochs': 200, 'patience': 40}


In [4]:
# extract the data
#!tar -xzf ./food11-hw13.tar.gz # Could take some time
# !tar -xzvf ./food11-hw13.tar.gz # use this command if you want to checkout the whole process.

In [5]:
# Show the file amounts in each split.
# The root comes from cfg['dataset_root'] rather than a repeated path literal, so this
# summary always describes the directory the datasets are actually loaded from.
for dirname, _, filenames in os.walk(cfg['dataset_root']):
    if filenames:  # skip directories that only contain sub-directories
        print(f"{dirname}: {len(filenames)} files.")

../input/ml2022spring-hw13/food11-hw13: 1 files.
../input/ml2022spring-hw13/food11-hw13/validation: 3430 files.
../input/ml2022spring-hw13/food11-hw13/training: 9866 files.
../input/ml2022spring-hw13/food11-hw13/evaluation: 3347 files.


In [6]:
# Per-channel mean/std normalization shared by the training and testing pipelines.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Testing transform: resize, center-crop to 224, convert to tensor, normalize.
# Modifying it is not encouraged when using the provided teacher model; this
# transform is standard and good enough for testing.
test_tfm = transforms.Compose([
    transforms.Resize(size=256),
    transforms.CenterCrop(size=224),
    transforms.ToTensor(),
    normalize,
])

# Training transform: random crop / flips / rotation / affine augmentation, followed by
# the same tensor conversion and normalization as the testing pipeline.
train_tfm = transforms.Compose([
    transforms.RandomResizedCrop(size=(224, 224), scale=(0.7, 1.0)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomRotation(degrees=180),
    transforms.RandomAffine(degrees=30),
    transforms.ToTensor(),
    normalize,
])

In [7]:
class FoodDataset(Dataset):
    """Image dataset whose labels are encoded in the file names.

    Each item is ``(transformed_image, label)``. The label is the integer in front of
    the first ``_`` of the file's base name (e.g. ``0_0.jpg`` -> ``0``). Files whose
    name does not start with an integer get the label ``-1``.

    Args:
        path: Directory scanned for ``*.jpg`` files when ``files`` is not given.
        tfm: Callable applied to the opened PIL image.
        files: Optional explicit sequence of image paths. When given, it is used
            as-is and ``path`` is not scanned.
    """

    def __init__(self, path, tfm=test_tfm, files = None):
        super().__init__()
        self.path = path
        # Identity check instead of `!= None`: it also works for sequence types that
        # overload comparison, and the directory is only listed when it is needed.
        if files is not None:
            self.files = files
        else:
            self.files = sorted(os.path.join(path, x) for x in os.listdir(path) if x.endswith(".jpg"))
        print(f"One {path} sample",self.files[0])
        self.transform = tfm

    def __len__(self):
        """Number of image files in the dataset."""
        return len(self.files)

    def __getitem__(self,idx):
        """Return ``(transformed image, label)`` for the file at position ``idx``."""
        fname = self.files[idx]
        im = Image.open(fname)
        im = self.transform(im)
        # basename() honours the platform's path separators; splitting on "/" would
        # make label parsing fail on other separators and silently return -1.
        try:
            label = int(os.path.basename(fname).split("_")[0])
        except ValueError:
            # Only a non-integer prefix means "no label"; other errors must surface.
            label = -1 # test has no label
        return im,label

In [8]:
# Form train/valid dataloaders.
# Both loaders share batch size, worker count and pinned-memory settings; only the
# training loader shuffles.
_loader_kwargs = dict(batch_size=cfg['batch_size'], num_workers=0, pin_memory=True)

train_dir = os.path.join(cfg['dataset_root'], "training")
train_set = FoodDataset(train_dir, tfm=train_tfm)
train_loader = DataLoader(train_set, shuffle=True, **_loader_kwargs)

valid_dir = os.path.join(cfg['dataset_root'], "validation")
valid_set = FoodDataset(valid_dir, tfm=test_tfm)
valid_loader = DataLoader(valid_set, shuffle=False, **_loader_kwargs)

One ../input/ml2022spring-hw13/food11-hw13/training sample ../input/ml2022spring-hw13/food11-hw13/training/0_0.jpg
One ../input/ml2022spring-hw13/food11-hw13/validation sample ../input/ml2022spring-hw13/food11-hw13/validation/0_0.jpg


In [9]:
# Example implementation of Depthwise and Pointwise Convolution 
def dwpw_conv(in_channels, out_channels, kernel_size, stride=1, padding=0):
    """Build a depthwise convolution followed by a 1x1 pointwise convolution.

    NOTE: the student-network cell further down defines another ``dwpw_conv``; once
    that cell runs, it replaces this example.

    Args:
        in_channels: Channels of the input; also the group count of the depthwise conv.
        out_channels: Channels produced by the pointwise conv.
        kernel_size: Kernel size of the depthwise conv.
        stride: Stride of the depthwise conv.
        padding: Padding of the depthwise conv.

    Returns:
        ``nn.Sequential(depthwise, pointwise)``.
    """
    # groups == in_channels gives every input channel its own spatial filter.
    depthwise = nn.Conv2d(
        in_channels,
        in_channels,
        kernel_size,
        stride=stride,
        padding=padding,
        groups=in_channels,
    )
    # The 1x1 convolution mixes channels and sets the output width.
    pointwise = nn.Conv2d(in_channels, out_channels, 1)
    return nn.Sequential(depthwise, pointwise)

In [10]:
# Define your student network here. You have to copy-paste this code block to HW13 GradeScope before deadline.
# We will use your student network definition to evaluate your results(including the total parameter amount).

def dwpw_conv(ic, oc, kernel_size=3, stride=2, padding=1):
    """Depthwise-separable convolution stage with normalization and activation.

    The stage is: depthwise conv -> BatchNorm -> LeakyReLU(0.01) -> 1x1 pointwise conv
    -> BatchNorm -> LeakyReLU(0.01).

    Args:
        ic: Input channels (also the group count of the depthwise conv).
        oc: Output channels of the pointwise conv.
        kernel_size: Kernel size of the depthwise conv.
        stride: Stride of the depthwise conv.
        padding: Padding of the depthwise conv.

    Returns:
        An ``nn.Sequential`` holding the six layers in the order listed above.
    """
    stage = [
        # depthwise convolution: one spatial filter per input channel
        nn.Conv2d(ic, ic, kernel_size, stride=stride, padding=padding, groups=ic),
        nn.BatchNorm2d(ic),
        nn.LeakyReLU(0.01, inplace=True),
        # pointwise convolution: 1x1 channel mixing
        nn.Conv2d(ic, oc, 1),
        nn.BatchNorm2d(oc),
        nn.LeakyReLU(0.01, inplace=True),
    ]
    return nn.Sequential(*stage)
            
class StudentNet(nn.Module):
    """Compact student classifier with 11 output classes.

    Layout: a convolutional stem (``conv1`` / ``bn1`` / ``relu`` / ``maxpool``), four
    ``dwpw_conv`` stages (``layer1`` .. ``layer4``), global average pooling and a
    linear classifier. Other cells access ``conv1``, ``bn1`` and ``layer1`` ..
    ``layer3`` by name, so these attribute names must stay as they are.
    """

    def __init__(self):
        super().__init__()

        # ---------- TODO ----------
        # Modify your model architecture
        # Stem: 224 --> 112 after conv1, halved again by the max-pool.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(num_features=64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Depthwise/pointwise stages; layer1 keeps the resolution (stride=1), the
        # remaining stages use dwpw_conv's default stride.
        self.layer1 = dwpw_conv(64, 64, stride=1)
        self.layer2 = dwpw_conv(64, 128)
        self.layer3 = dwpw_conv(128, 256)
        self.layer4 = dwpw_conv(256, 140)

        # Here we adopt Global Average Pooling for various input size.
        self.avgpool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        self.fc = nn.Linear(in_features=140, out_features=11)

    def forward(self, x):
        """Return the class logits for a batch of images ``x``."""
        feats = self.conv1(x)
        feats = self.bn1(feats)
        feats = self.relu(feats)
        feats = self.maxpool(feats)
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            feats = stage(feats)
        pooled = self.avgpool(feats)
        return self.fc(pooled.flatten(1))

def get_student_model(): # This function should have no arguments so that we can get your student network by directly calling it.
    """Return a newly constructed ``StudentNet`` instance."""
    # you can modify or do anything here, just remember to return an nn.Module as your student network.  
    return StudentNet() 

# End of definition of your student model and the get_student_model API
# Please copy-paste the whole code block, including the get_student_model function.

In [11]:
# DO NOT modify this block and please make sure that this block can run successfully.
# It builds the student network and prints its layer summary for a (3, 224, 224) input on the CPU.
student_model = get_student_model()
summary(student_model, (3, 224, 224), device='cpu')
# You have to copy&paste the results of this block to HW13 GradeScope. 

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]             640
       BatchNorm2d-6           [-1, 64, 56, 56]             128
         LeakyReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]           4,160
       BatchNorm2d-9           [-1, 64, 56, 56]             128
        LeakyReLU-10           [-1, 64, 56, 56]               0
           Conv2d-11           [-1, 64, 28, 28]             640
      BatchNorm2d-12           [-1, 64, 28, 28]             128
        LeakyReLU-13           [-1, 64, 28, 28]               0
           Conv2d-14          [-1, 128,

In [12]:
# Provided teacher model (architecture: resnet18, num_classes=11, test-acc ~= 89.9%).
# The checkpoint is stored next to the dataset splits.
teacher_ckpt_path = os.path.join(cfg['dataset_root'], "resnet18_teacher.ckpt")

# Build the architecture without pretrained weights, then load the provided state dict.
teacher_model = torch.hub.load(
    'pytorch/vision:v0.10.0',
    'resnet18',
    pretrained=False,
    num_classes=11,
)
teacher_model.load_state_dict(
    torch.load(teacher_ckpt_path, map_location='cpu')
)

# Now you already know the teacher model's architecture. You can take advantage of it
# if you want to pass the strong or boss baseline.
# Source code of resnet in pytorch: (https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py)
# The summary below lists the teacher's layers; it has 11,182,155 parameters in total.
summary(teacher_model, (3, 224, 224), device='cpu')

Downloading: "https://github.com/pytorch/vision/zipball/v0.10.0" to /root/.cache/torch/hub/v0.10.0.zip


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]          36,864
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
       BasicBlock-11           [-1, 64, 56, 56]               0
           Conv2d-12           [-1, 64, 56, 56]          36,864
      BatchNorm2d-13           [-1, 64, 56, 56]             128
             ReLU-14           [-1, 64,

In [13]:
def use_pretrain(student=None, teacher=None):
    """Initialise the student's stem from the teacher and freeze it.

    Copies the teacher's ``conv1.weight`` and the ``bn1`` weight, bias, running mean
    and running variance into the student, then disables gradients for the student's
    ``conv1.weight``, ``bn1.weight`` and ``bn1.bias``.

    The values are copied instead of re-binding the teacher's tensors onto the
    student. Re-binding makes both models hold the very same Parameter/buffer
    objects, so freezing the student would also switch off ``requires_grad`` on the
    teacher, and any in-place change to the shared BatchNorm statistics made through
    the student would alter the teacher as well.

    Args:
        student: Model to initialise; defaults to the module-level ``student_model``.
        teacher: Model to copy from; defaults to the module-level ``teacher_model``.
    """
    if student is None:
        student = student_model
    if teacher is None:
        teacher = teacher_model

    # In-place copies into leaf parameters are only allowed with autograd disabled.
    with torch.no_grad():
        student.conv1.weight.copy_(teacher.conv1.weight)
        student.bn1.weight.copy_(teacher.bn1.weight)
        student.bn1.bias.copy_(teacher.bn1.bias)
        student.bn1.running_mean.copy_(teacher.bn1.running_mean)
        student.bn1.running_var.copy_(teacher.bn1.running_var)

    # Freeze the copied stem parameters of the student only.
    for param in (student.conv1.weight, student.bn1.weight, student.bn1.bias):
        param.requires_grad = False
# Apply the teacher stem initialisation to the module-level student_model / teacher_model.
use_pretrain()


class HookTool:
    """Holder for the most recent output seen by a forward hook."""

    def __init__(self):
        # Stays None until the hook has been called for the first time.
        self.fea = None

    def hook_fun(self, module, fea_in, fea_out):
        """Forward-hook callback: remember ``fea_out``; ``module`` and ``fea_in`` are unused."""
        self.fea = fea_out
        
def get_feas_by_hook(model, names=['layer1', 'layer2', 'layer3']):
    """Attach a feature-recording forward hook to each named sub-module.

    Args:
        model: Module whose ``named_modules()`` are searched.
        names: Qualified sub-module names to hook.

    Returns:
        One ``HookTool`` per matched sub-module, in ``named_modules()`` order. After a
        forward pass, each tool's ``fea`` holds the latest output of its sub-module.
        The hook handles are not kept, so the hooks stay registered on ``model``.
    """
    recorders = []
    for mod_name, submodule in model.named_modules():
        if mod_name not in names:
            continue
        recorder = HookTool()
        submodule.register_forward_hook(recorder.hook_fun)
        recorders.append(recorder)
    return recorders


# Record the default layers' outputs for both networks.
fea_hooks_teacher = get_feas_by_hook(teacher_model)
fea_hooks_student = get_feas_by_hook(student_model)

def loss_fea_layers(student, teacher):
    """Weighted feature-matching loss between student and teacher hook outputs.

    For position ``i``, the smooth-L1 loss between ``student[i].fea`` and
    ``teacher[i].fea`` is weighted by ``len(student) - i``, so earlier entries count
    more. The weighted terms are summed.

    Args:
        student: Sequence of objects exposing a ``fea`` tensor (student features).
        teacher: Sequence of objects exposing a ``fea`` tensor, indexed like ``student``.

    Returns:
        The summed loss; the integer ``0`` when ``student`` is empty.
    """
    n_layers = len(student)
    return sum(
        (n_layers - idx) * F.smooth_l1_loss(s_hook.fea, teacher[idx].fea)
        for idx, s_hook in enumerate(student)
    )

In [14]:
# Implement the loss function with KL divergence loss for knowledge distillation.
# You also have to copy-paste this whole block to HW13 GradeScope. 
CE = nn.CrossEntropyLoss()
def loss_fn_kd(student_logits, labels, teacher_logits, alpha=0.5, temperature=20.0):
    """Knowledge-distillation loss: temperature-scaled KL divergence plus cross-entropy.

    Computes

        alpha * T^2 * KL(softmax(teacher / T) || softmax(student / T))
            + (1 - alpha) * CE(student_logits, labels)

    where the KL term is summed over classes and averaged over the batch.

    Args:
        student_logits: Raw student outputs, classes on the last dimension.
        labels: Ground-truth class indices for the cross-entropy term.
        teacher_logits: Raw teacher outputs with the same shape as ``student_logits``.
        alpha: Weight of the distillation term; ``1 - alpha`` weights the CE term.
        temperature: Softmax temperature ``T`` applied to both sets of logits.

    Returns:
        The combined scalar loss.
    """
    # Stay in log-space: softmax(...).log() underflows to -inf (and then NaN via
    # 0 * inf) when logits are far apart, while log_softmax remains finite.
    student_log_p = F.log_softmax(student_logits / temperature, dim=-1)
    teacher_log_p = F.log_softmax(teacher_logits / temperature, dim=-1)
    kl_loss = (teacher_log_p.exp() * (teacher_log_p - student_log_p)).sum(dim=-1).mean()
    ce_loss = CE(student_logits, labels)
    return alpha*(temperature**2)*kl_loss + (1 - alpha)*ce_loss

In [15]:
# choose the loss function by the config
# Every choice must be callable as
#     loss_fn(student_logits, labels, teacher_logits, alpha=...)
# because the training/validation loop always passes the teacher logits (and `alpha`
# during training). A bare nn.CrossEntropyLoss would raise a TypeError on those
# extra arguments, so the CE option is wrapped to accept and ignore them.
if cfg['loss_fn_type'] == 'CE':
    # For the classification task, we use cross-entropy as the default loss function.
    _ce_criterion = nn.CrossEntropyLoss()

    def loss_fn(student_logits, labels, teacher_logits=None, **_kd_kwargs):
        """Plain cross-entropy (simple baseline); distillation arguments are ignored."""
        return _ce_criterion(student_logits, labels)

if cfg['loss_fn_type'] == 'KD': # KD stands for knowledge distillation
    loss_fn = loss_fn_kd # implement loss_fn_kd for the report question and the medium baseline.

# You can also adopt other types of knowledge distillation techniques for strong and boss baseline, but use function name other than `loss_fn_kd`
# For example:
# def loss_fn_custom_kd():
#     pass
# if cfg['loss_fn_type'] == 'custom_kd':
#     loss_fn = loss_fn_custom_kd

# "cuda" only when GPUs are available.
device = "cuda" if torch.cuda.is_available() else "cpu"
log(f"device: {device}")

# The number of training epochs and patience.
n_epochs = cfg['n_epochs']
patience = cfg['patience'] # If no improvement in 'patience' epochs, early stop

device: cuda


In [16]:
# Train the student with logit distillation plus feature matching, validate after every
# epoch, keep the checkpoint with the best validation accuracy, and stop early when the
# accuracy has not improved for more than `patience` epochs.

# Initialize a model, and put it on the device specified.
student_model.to(device)
teacher_model.to(device) # MEDIUM BASELINE

# Initialize optimizer, you may fine-tune some hyperparameters such as learning rate on your own.
# Only parameters that still require gradients are handed to the optimizer.
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, student_model.parameters()), lr=cfg['lr'], weight_decay=cfg['weight_decay']) 
# NOTE(review): scheduler.step() is commented out in the epoch loop below, so this scheduler is never advanced.
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=6, T_mult=2, eta_min=1e-5)

# Initialize trackers, these are not parameters and should not be changed
stale = 0        # consecutive epochs without a new best validation accuracy
best_acc = 0.0   # best validation accuracy seen so far

teacher_model.eval()  # MEDIUM BASELINE
for epoch in range(n_epochs):

    # ---------- Training ----------
    # Make sure the model is in train mode before training.
    student_model.train()

    # These are used to record information in training.
    train_loss = []
    train_loss_fea = []
    train_accs = []
    train_lens = []
    # Fraction of training completed after this epoch, in (0, 1]; it schedules the loss weights below.
    percent = (1+epoch)/n_epochs

    for batch in tqdm(train_loader):

        # A batch consists of image data and corresponding labels.
        imgs, labels = batch
        imgs = imgs.to(device)
        labels = labels.to(device)
        #imgs = imgs.half()
        #print(imgs.shape,labels.shape)

        # Forward the data. (Make sure data and model are on the same device.)
        # The teacher runs without gradient tracking; only the student is trained.
        with torch.no_grad():  # MEDIUM BASELINE
            teacher_logits = teacher_model(imgs)  # MEDIUM BASELINE
        
        logits = student_model(imgs)

        # Logit loss from the configured loss_fn; its `alpha` shrinks as 1 - percent^2 over training.
        # We don't need to apply softmax before computing cross-entropy as it is done automatically.
        loss_logits = loss_fn(logits, labels, teacher_logits, alpha=1 - percent*percent) # MEDIUM BASELINE
        # Feature loss between the outputs captured by the student and teacher forward hooks.
        loss_fea = loss_fea_layers(fea_hooks_student, fea_hooks_teacher)
        # The weight of the logit loss grows as 10 * percent^2; the feature loss keeps weight 1.
        loss = (10*percent*percent) * loss_logits + loss_fea
        #loss = loss_logits
        #loss = loss_fn(logits, labels) # SIMPLE BASELINE
        # Gradients stored in the parameters in the previous step should be cleared out first.
        optimizer.zero_grad()

        # Compute the gradients for parameters.
        loss.backward()

        # Clip the gradient norms for stable training.
        grad_norm = nn.utils.clip_grad_norm_(student_model.parameters(), max_norm=cfg['grad_norm_max'])

        # Update the parameters with computed gradients.
        optimizer.step()
        
        # Number of correct predictions in the current batch (a count, not a ratio).
        acc = (logits.argmax(dim=-1) == labels).float().sum()

        # Record the loss and accuracy.
        # Losses are weighted by the batch size so the epoch value is a per-sample average.
        train_batch_len = len(imgs)
        train_loss.append(loss_logits.item() * train_batch_len)
        train_loss_fea.append(loss_fea.item()* train_batch_len)
        train_accs.append(acc)
        train_lens.append(train_batch_len)
    
    #scheduler.step()    
    train_loss = sum(train_loss) / sum(train_lens)
    train_loss_fea = sum(train_loss_fea) / sum(train_lens)
    train_acc = sum(train_accs) / sum(train_lens)

    # Print the information.
    log(f"[ Train | {epoch + 1:03d}/{n_epochs:03d} ] loss = {train_loss:.5f}, loss_fea = {train_loss_fea:.5f}, acc = {train_acc:.5f}")

    # ---------- Validation ----------
    # Make sure the model is in eval mode so that some modules like dropout are disabled and work normally.
    student_model.eval()

    # These are used to record information in validation.
    valid_loss = []
    valid_accs = []
    valid_lens = []

    # Iterate the validation set by batches.
    for batch in tqdm(valid_loader):

        # A batch consists of image data and corresponding labels.
        imgs, labels = batch
        imgs = imgs.to(device)
        labels = labels.to(device)

        # We don't need gradient in validation.
        # Using torch.no_grad() accelerates the forward process.
        with torch.no_grad():
            logits = student_model(imgs)
            teacher_logits = teacher_model(imgs) # MEDIUM BASELINE

        # We can still compute the loss (but not the gradient).
        # Validation calls loss_fn with its default alpha, not the scheduled training value.
        loss = loss_fn(logits, labels, teacher_logits) # MEDIUM BASELINE
        #loss = loss_fn(logits, labels) # SIMPLE BASELINE

        # Number of correct predictions in the current batch.
        acc = (logits.argmax(dim=-1) == labels).float().sum()

        # Record the loss and accuracy.
        batch_len = len(imgs)
        valid_loss.append(loss.item() * batch_len)
        valid_accs.append(acc)
        valid_lens.append(batch_len)
        #break

    # The average loss and accuracy for entire validation set is the average of the recorded values.
    valid_loss = sum(valid_loss) / sum(valid_lens)
    valid_acc = sum(valid_accs) / sum(valid_lens)

    # update logs
    
    if valid_acc > best_acc:
        log(f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f} ---------------------> best")
    else:
        log(f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}")


    # save models
    if valid_acc > best_acc:
        log(f"Best model found at epoch {epoch+1}, saving model")
        torch.save(student_model.state_dict(), f"{save_path}/student_best.ckpt") # only save best to prevent output memory exceed error
        best_acc = valid_acc
        stale = 0
    else:
        stale += 1
        # Strict '>': training stops only after patience + 1 consecutive epochs without improvement.
        if stale > patience:
            log(f"No improvment {patience} consecutive epochs, early stopping")
            break
log("Finish training")
# The log file is closed here; log() cannot be used after this point.
log_fw.close()

  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 001/200 ] loss = 18.43655, loss_fea = 3.52866, acc = 0.24296


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 001/200 ] loss = 17.03728, acc = 0.25714 ---------------------> best
Best model found at epoch 1, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 002/200 ] loss = 16.45703, loss_fea = 2.91248, acc = 0.31918


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 002/200 ] loss = 15.68271, acc = 0.31953 ---------------------> best
Best model found at epoch 2, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 003/200 ] loss = 15.07348, loss_fea = 2.55323, acc = 0.38253


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 003/200 ] loss = 13.93445, acc = 0.41429 ---------------------> best
Best model found at epoch 3, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 004/200 ] loss = 13.56667, loss_fea = 2.29817, acc = 0.44557


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 004/200 ] loss = 13.00247, acc = 0.48513 ---------------------> best
Best model found at epoch 4, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 005/200 ] loss = 12.39211, loss_fea = 2.11343, acc = 0.47740


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 005/200 ] loss = 12.10466, acc = 0.47930


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 006/200 ] loss = 11.69553, loss_fea = 1.97673, acc = 0.51571


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 006/200 ] loss = 12.12562, acc = 0.51516 ---------------------> best
Best model found at epoch 6, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 007/200 ] loss = 11.03563, loss_fea = 1.87376, acc = 0.53497


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 007/200 ] loss = 11.24042, acc = 0.49067


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 008/200 ] loss = 10.44254, loss_fea = 1.79473, acc = 0.55656


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 008/200 ] loss = 10.48037, acc = 0.54927 ---------------------> best
Best model found at epoch 8, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 009/200 ] loss = 10.19049, loss_fea = 1.73391, acc = 0.56902


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 009/200 ] loss = 10.14838, acc = 0.54461


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 010/200 ] loss = 9.82854, loss_fea = 1.68568, acc = 0.58717


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 010/200 ] loss = 10.81438, acc = 0.53149


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 011/200 ] loss = 9.50132, loss_fea = 1.64861, acc = 0.59923


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 011/200 ] loss = 9.74222, acc = 0.55831 ---------------------> best
Best model found at epoch 11, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 012/200 ] loss = 9.10571, loss_fea = 1.61995, acc = 0.61362


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 012/200 ] loss = 9.26113, acc = 0.58017 ---------------------> best
Best model found at epoch 12, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 013/200 ] loss = 8.93669, loss_fea = 1.59849, acc = 0.61899


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 013/200 ] loss = 9.43885, acc = 0.56268


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 014/200 ] loss = 8.71604, loss_fea = 1.58269, acc = 0.62761


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 014/200 ] loss = 9.26794, acc = 0.59475 ---------------------> best
Best model found at epoch 14, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 015/200 ] loss = 8.56172, loss_fea = 1.57087, acc = 0.63663


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 015/200 ] loss = 9.19762, acc = 0.60758 ---------------------> best
Best model found at epoch 15, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 016/200 ] loss = 8.34373, loss_fea = 1.56259, acc = 0.64433


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 016/200 ] loss = 8.63793, acc = 0.61837 ---------------------> best
Best model found at epoch 16, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 017/200 ] loss = 8.23343, loss_fea = 1.55668, acc = 0.64940


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 017/200 ] loss = 8.55842, acc = 0.61224


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 018/200 ] loss = 7.94900, loss_fea = 1.55140, acc = 0.65640


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 018/200 ] loss = 9.18556, acc = 0.57405


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 019/200 ] loss = 7.87903, loss_fea = 1.54898, acc = 0.66065


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 019/200 ] loss = 7.91665, acc = 0.65306 ---------------------> best
Best model found at epoch 19, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 020/200 ] loss = 7.78703, loss_fea = 1.54813, acc = 0.66978


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 020/200 ] loss = 8.23960, acc = 0.63732


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 021/200 ] loss = 7.61891, loss_fea = 1.54729, acc = 0.67160


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 021/200 ] loss = 9.49280, acc = 0.57143


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 022/200 ] loss = 7.55077, loss_fea = 1.54852, acc = 0.67667


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 022/200 ] loss = 7.68384, acc = 0.65335 ---------------------> best
Best model found at epoch 22, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 023/200 ] loss = 7.45504, loss_fea = 1.55152, acc = 0.67728


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 023/200 ] loss = 7.99037, acc = 0.62653


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 024/200 ] loss = 7.33867, loss_fea = 1.55259, acc = 0.67880


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 024/200 ] loss = 7.52337, acc = 0.66327 ---------------------> best
Best model found at epoch 24, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 025/200 ] loss = 7.15967, loss_fea = 1.55263, acc = 0.68589


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 025/200 ] loss = 7.98693, acc = 0.66356 ---------------------> best
Best model found at epoch 25, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 026/200 ] loss = 7.26198, loss_fea = 1.55733, acc = 0.68630


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 026/200 ] loss = 7.29101, acc = 0.67930 ---------------------> best
Best model found at epoch 26, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 027/200 ] loss = 7.11256, loss_fea = 1.55784, acc = 0.69380


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 027/200 ] loss = 7.47364, acc = 0.65802


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 028/200 ] loss = 6.94020, loss_fea = 1.55953, acc = 0.70282


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 028/200 ] loss = 7.71976, acc = 0.65656


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 029/200 ] loss = 6.88044, loss_fea = 1.56292, acc = 0.70343


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 029/200 ] loss = 7.37169, acc = 0.66064


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 030/200 ] loss = 6.65254, loss_fea = 1.56505, acc = 0.70728


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 030/200 ] loss = 7.09019, acc = 0.68921 ---------------------> best
Best model found at epoch 30, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 031/200 ] loss = 6.71915, loss_fea = 1.56743, acc = 0.71245


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 031/200 ] loss = 7.14011, acc = 0.67697


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 032/200 ] loss = 6.58470, loss_fea = 1.56985, acc = 0.71417


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 032/200 ] loss = 6.88878, acc = 0.68455


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 033/200 ] loss = 6.57985, loss_fea = 1.57280, acc = 0.71498


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 033/200 ] loss = 6.95657, acc = 0.68017


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 034/200 ] loss = 6.50759, loss_fea = 1.57487, acc = 0.71812


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 034/200 ] loss = 6.95639, acc = 0.68396


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 035/200 ] loss = 6.43810, loss_fea = 1.57809, acc = 0.71995


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 035/200 ] loss = 6.94867, acc = 0.69504 ---------------------> best
Best model found at epoch 35, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 036/200 ] loss = 6.32539, loss_fea = 1.58144, acc = 0.71954


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 036/200 ] loss = 6.96509, acc = 0.67901


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 037/200 ] loss = 6.31440, loss_fea = 1.58258, acc = 0.72481


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 037/200 ] loss = 6.56282, acc = 0.70583 ---------------------> best
Best model found at epoch 37, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 038/200 ] loss = 6.21244, loss_fea = 1.58723, acc = 0.72380


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 038/200 ] loss = 6.86368, acc = 0.66968


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 039/200 ] loss = 6.10370, loss_fea = 1.59187, acc = 0.73181


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 039/200 ] loss = 7.05027, acc = 0.66706


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 040/200 ] loss = 6.04953, loss_fea = 1.59479, acc = 0.73059


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 040/200 ] loss = 7.09155, acc = 0.67813


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 041/200 ] loss = 6.08529, loss_fea = 1.59706, acc = 0.72806


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 041/200 ] loss = 6.62216, acc = 0.70292


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 042/200 ] loss = 5.98646, loss_fea = 1.60113, acc = 0.74022


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 042/200 ] loss = 6.25951, acc = 0.71633 ---------------------> best
Best model found at epoch 42, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 043/200 ] loss = 5.92933, loss_fea = 1.60417, acc = 0.73485


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 043/200 ] loss = 6.56486, acc = 0.70554


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 044/200 ] loss = 5.90831, loss_fea = 1.60718, acc = 0.73971


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 044/200 ] loss = 6.34020, acc = 0.72012 ---------------------> best
Best model found at epoch 44, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 045/200 ] loss = 5.83362, loss_fea = 1.61086, acc = 0.73839


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 045/200 ] loss = 6.34698, acc = 0.71691


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 046/200 ] loss = 5.76630, loss_fea = 1.61429, acc = 0.74133


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 046/200 ] loss = 6.29565, acc = 0.71429


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 047/200 ] loss = 5.70763, loss_fea = 1.61617, acc = 0.74407


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 047/200 ] loss = 6.48068, acc = 0.69504


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 048/200 ] loss = 5.68990, loss_fea = 1.61750, acc = 0.74802


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 048/200 ] loss = 6.30619, acc = 0.70175


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 049/200 ] loss = 5.71499, loss_fea = 1.62373, acc = 0.74721


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 049/200 ] loss = 6.25685, acc = 0.70875


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 050/200 ] loss = 5.61986, loss_fea = 1.62538, acc = 0.75451


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 050/200 ] loss = 6.58479, acc = 0.69475


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 051/200 ] loss = 5.53521, loss_fea = 1.62863, acc = 0.74863


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 051/200 ] loss = 6.26019, acc = 0.70146


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 052/200 ] loss = 5.51032, loss_fea = 1.63072, acc = 0.75117


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 052/200 ] loss = 6.02143, acc = 0.71691


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 053/200 ] loss = 5.46493, loss_fea = 1.63602, acc = 0.74660


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 053/200 ] loss = 6.19907, acc = 0.70496


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 054/200 ] loss = 5.40441, loss_fea = 1.63847, acc = 0.75532


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 054/200 ] loss = 6.53592, acc = 0.71691


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 055/200 ] loss = 5.36582, loss_fea = 1.64279, acc = 0.75664


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 055/200 ] loss = 6.16890, acc = 0.69883


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 056/200 ] loss = 5.28076, loss_fea = 1.64532, acc = 0.75654


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 056/200 ] loss = 5.65772, acc = 0.74402 ---------------------> best
Best model found at epoch 56, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 057/200 ] loss = 5.25348, loss_fea = 1.64885, acc = 0.76029


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 057/200 ] loss = 5.89981, acc = 0.71137


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 058/200 ] loss = 5.23226, loss_fea = 1.65204, acc = 0.76029


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 058/200 ] loss = 6.08829, acc = 0.69621


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 059/200 ] loss = 5.27899, loss_fea = 1.65355, acc = 0.76150


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 059/200 ] loss = 6.41045, acc = 0.71050


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 060/200 ] loss = 5.16709, loss_fea = 1.65663, acc = 0.76069


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 060/200 ] loss = 6.33521, acc = 0.69883


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 061/200 ] loss = 5.12455, loss_fea = 1.66067, acc = 0.76444


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 061/200 ] loss = 5.77596, acc = 0.72449


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 062/200 ] loss = 5.17842, loss_fea = 1.66339, acc = 0.76201


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 062/200 ] loss = 5.92726, acc = 0.72741


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 063/200 ] loss = 5.08476, loss_fea = 1.66615, acc = 0.76454


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 063/200 ] loss = 5.82129, acc = 0.73324


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 064/200 ] loss = 4.98711, loss_fea = 1.66834, acc = 0.76708


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 064/200 ] loss = 5.74662, acc = 0.71953


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 065/200 ] loss = 5.05334, loss_fea = 1.67150, acc = 0.77113


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 065/200 ] loss = 5.59440, acc = 0.73090


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 066/200 ] loss = 4.87379, loss_fea = 1.67184, acc = 0.76596


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 066/200 ] loss = 5.99800, acc = 0.72741


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 067/200 ] loss = 4.90953, loss_fea = 1.67608, acc = 0.76779


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 067/200 ] loss = 5.75539, acc = 0.74227


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 068/200 ] loss = 4.86548, loss_fea = 1.67923, acc = 0.77428


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 068/200 ] loss = 5.67483, acc = 0.73469


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 069/200 ] loss = 4.83780, loss_fea = 1.68344, acc = 0.77559


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 069/200 ] loss = 6.30951, acc = 0.69184


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 070/200 ] loss = 4.78836, loss_fea = 1.68512, acc = 0.77661


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 070/200 ] loss = 5.82314, acc = 0.73586


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 071/200 ] loss = 4.77235, loss_fea = 1.68540, acc = 0.77265


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 071/200 ] loss = 5.62653, acc = 0.74985 ---------------------> best
Best model found at epoch 71, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 072/200 ] loss = 4.68896, loss_fea = 1.68925, acc = 0.78107


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 072/200 ] loss = 5.66671, acc = 0.73907


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 073/200 ] loss = 4.70924, loss_fea = 1.69210, acc = 0.78137


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 073/200 ] loss = 6.13796, acc = 0.73703


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 074/200 ] loss = 4.64841, loss_fea = 1.69571, acc = 0.77742


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 074/200 ] loss = 5.74556, acc = 0.72332


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 075/200 ] loss = 4.65402, loss_fea = 1.69750, acc = 0.78269


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 075/200 ] loss = 6.03809, acc = 0.72741


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 076/200 ] loss = 4.56083, loss_fea = 1.69934, acc = 0.78593


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 076/200 ] loss = 5.53305, acc = 0.74461


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 077/200 ] loss = 4.53136, loss_fea = 1.70018, acc = 0.78076


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 077/200 ] loss = 5.58555, acc = 0.74169


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 078/200 ] loss = 4.51763, loss_fea = 1.70233, acc = 0.78634


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 078/200 ] loss = 5.32850, acc = 0.76589 ---------------------> best
Best model found at epoch 78, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 079/200 ] loss = 4.51466, loss_fea = 1.70548, acc = 0.78178


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 079/200 ] loss = 5.66291, acc = 0.74257


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 080/200 ] loss = 4.45131, loss_fea = 1.70879, acc = 0.78461


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 080/200 ] loss = 5.27612, acc = 0.76268


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 081/200 ] loss = 4.36579, loss_fea = 1.71198, acc = 0.78735


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 081/200 ] loss = 5.40904, acc = 0.74257


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 082/200 ] loss = 4.32840, loss_fea = 1.71406, acc = 0.78664


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 082/200 ] loss = 5.66707, acc = 0.72362


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 083/200 ] loss = 4.35228, loss_fea = 1.71469, acc = 0.78603


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 083/200 ] loss = 5.28561, acc = 0.75335


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 084/200 ] loss = 4.31692, loss_fea = 1.71887, acc = 0.79627


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 084/200 ] loss = 5.81735, acc = 0.71341


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 085/200 ] loss = 4.26388, loss_fea = 1.72082, acc = 0.79151


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 085/200 ] loss = 5.29168, acc = 0.75948


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 086/200 ] loss = 4.21599, loss_fea = 1.72064, acc = 0.79120


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 086/200 ] loss = 5.36466, acc = 0.75627


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 087/200 ] loss = 4.20805, loss_fea = 1.72092, acc = 0.79019


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 087/200 ] loss = 5.36299, acc = 0.75510


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 088/200 ] loss = 4.23045, loss_fea = 1.72537, acc = 0.79211


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 088/200 ] loss = 5.21344, acc = 0.76472


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 089/200 ] loss = 4.14799, loss_fea = 1.72787, acc = 0.79485


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 089/200 ] loss = 5.45430, acc = 0.75044


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 090/200 ] loss = 4.10731, loss_fea = 1.72982, acc = 0.79536


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 090/200 ] loss = 5.36848, acc = 0.74956


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 091/200 ] loss = 4.14350, loss_fea = 1.73187, acc = 0.79556


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 091/200 ] loss = 5.14002, acc = 0.76764 ---------------------> best
Best model found at epoch 91, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 092/200 ] loss = 4.07123, loss_fea = 1.73428, acc = 0.79607


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 092/200 ] loss = 5.57484, acc = 0.74111


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 093/200 ] loss = 4.03810, loss_fea = 1.73626, acc = 0.79627


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 093/200 ] loss = 5.32142, acc = 0.74257


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 094/200 ] loss = 4.00573, loss_fea = 1.73788, acc = 0.79830


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 094/200 ] loss = 5.40658, acc = 0.76297


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 095/200 ] loss = 3.99000, loss_fea = 1.74093, acc = 0.79627


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 095/200 ] loss = 5.82231, acc = 0.74315


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 096/200 ] loss = 3.92136, loss_fea = 1.74309, acc = 0.80225


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 096/200 ] loss = 5.50531, acc = 0.75073


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 097/200 ] loss = 3.91553, loss_fea = 1.74444, acc = 0.80641


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 097/200 ] loss = 5.04572, acc = 0.75569


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 098/200 ] loss = 3.82872, loss_fea = 1.74751, acc = 0.80590


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 098/200 ] loss = 5.21278, acc = 0.75831


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 099/200 ] loss = 3.85938, loss_fea = 1.75059, acc = 0.80681


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 099/200 ] loss = 5.10071, acc = 0.76531


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 100/200 ] loss = 3.83719, loss_fea = 1.75299, acc = 0.79799


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 100/200 ] loss = 5.11412, acc = 0.76647


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 101/200 ] loss = 3.81147, loss_fea = 1.75414, acc = 0.80266


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 101/200 ] loss = 5.38573, acc = 0.75831


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 102/200 ] loss = 3.78566, loss_fea = 1.75782, acc = 0.80762


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 102/200 ] loss = 5.10587, acc = 0.77464 ---------------------> best
Best model found at epoch 102, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 103/200 ] loss = 3.75051, loss_fea = 1.76058, acc = 0.80053


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 103/200 ] loss = 5.17451, acc = 0.74840


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 104/200 ] loss = 3.72808, loss_fea = 1.76176, acc = 0.80235


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 104/200 ] loss = 5.11870, acc = 0.76676


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 105/200 ] loss = 3.70732, loss_fea = 1.76252, acc = 0.80945


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 105/200 ] loss = 5.23172, acc = 0.75656


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 106/200 ] loss = 3.63914, loss_fea = 1.76581, acc = 0.80377


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 106/200 ] loss = 5.40695, acc = 0.76939


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 107/200 ] loss = 3.61179, loss_fea = 1.76853, acc = 0.80519


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 107/200 ] loss = 5.27706, acc = 0.76589


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 108/200 ] loss = 3.63266, loss_fea = 1.76991, acc = 0.81097


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 108/200 ] loss = 5.39410, acc = 0.76618


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 109/200 ] loss = 3.55994, loss_fea = 1.77022, acc = 0.80559


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 109/200 ] loss = 5.02886, acc = 0.76647


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 110/200 ] loss = 3.52785, loss_fea = 1.77307, acc = 0.81188


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 110/200 ] loss = 5.16637, acc = 0.76968


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 111/200 ] loss = 3.48465, loss_fea = 1.77592, acc = 0.81087


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 111/200 ] loss = 4.93737, acc = 0.77026


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 112/200 ] loss = 3.48176, loss_fea = 1.77988, acc = 0.80407


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 112/200 ] loss = 5.06531, acc = 0.77172


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 113/200 ] loss = 3.46039, loss_fea = 1.77955, acc = 0.80914


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 113/200 ] loss = 5.26171, acc = 0.77376


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 114/200 ] loss = 3.41125, loss_fea = 1.78051, acc = 0.81441


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 114/200 ] loss = 5.39703, acc = 0.73907


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 115/200 ] loss = 3.35481, loss_fea = 1.78125, acc = 0.81421


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 115/200 ] loss = 5.47880, acc = 0.72653


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 116/200 ] loss = 3.34509, loss_fea = 1.78478, acc = 0.81168


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 116/200 ] loss = 5.33599, acc = 0.76764


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 117/200 ] loss = 3.33464, loss_fea = 1.78574, acc = 0.81036


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 117/200 ] loss = 5.17734, acc = 0.76793


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 118/200 ] loss = 3.28762, loss_fea = 1.78817, acc = 0.81421


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 118/200 ] loss = 5.06820, acc = 0.76997


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 119/200 ] loss = 3.26110, loss_fea = 1.78945, acc = 0.81492


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 119/200 ] loss = 5.33022, acc = 0.76268


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 120/200 ] loss = 3.21881, loss_fea = 1.79147, acc = 0.81614


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 120/200 ] loss = 4.99897, acc = 0.75889


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 121/200 ] loss = 3.21419, loss_fea = 1.79346, acc = 0.81715


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 121/200 ] loss = 5.21995, acc = 0.74636


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 122/200 ] loss = 3.16814, loss_fea = 1.79374, acc = 0.81847


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 122/200 ] loss = 5.05493, acc = 0.76356


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 123/200 ] loss = 3.17072, loss_fea = 1.79580, acc = 0.81228


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 123/200 ] loss = 4.99782, acc = 0.77930 ---------------------> best
Best model found at epoch 123, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 124/200 ] loss = 3.09548, loss_fea = 1.79899, acc = 0.82232


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 124/200 ] loss = 5.07727, acc = 0.77872


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 125/200 ] loss = 3.06736, loss_fea = 1.80066, acc = 0.81857


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 125/200 ] loss = 5.26462, acc = 0.77055


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 126/200 ] loss = 3.00483, loss_fea = 1.80382, acc = 0.81938


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 126/200 ] loss = 5.02645, acc = 0.77201


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 127/200 ] loss = 3.01874, loss_fea = 1.80488, acc = 0.82465


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 127/200 ] loss = 4.86450, acc = 0.78222 ---------------------> best
Best model found at epoch 127, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 128/200 ] loss = 2.98422, loss_fea = 1.80940, acc = 0.82536


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 128/200 ] loss = 5.00272, acc = 0.77959


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 129/200 ] loss = 2.97179, loss_fea = 1.80997, acc = 0.82242


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 129/200 ] loss = 5.18593, acc = 0.76064


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 130/200 ] loss = 2.88975, loss_fea = 1.80959, acc = 0.82526


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 130/200 ] loss = 5.17929, acc = 0.77959


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 131/200 ] loss = 2.89133, loss_fea = 1.81201, acc = 0.82587


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 131/200 ] loss = 5.17268, acc = 0.76939


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 132/200 ] loss = 2.87355, loss_fea = 1.81274, acc = 0.82141


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 132/200 ] loss = 5.05193, acc = 0.77464


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 133/200 ] loss = 2.84448, loss_fea = 1.81699, acc = 0.82546


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 133/200 ] loss = 4.94166, acc = 0.76997


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 134/200 ] loss = 2.80668, loss_fea = 1.81657, acc = 0.82941


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 134/200 ] loss = 5.12270, acc = 0.78076


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 135/200 ] loss = 2.76862, loss_fea = 1.81907, acc = 0.82678


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 135/200 ] loss = 5.04370, acc = 0.77405


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 136/200 ] loss = 2.74169, loss_fea = 1.82042, acc = 0.82637


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 136/200 ] loss = 5.07422, acc = 0.76589


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 137/200 ] loss = 2.72283, loss_fea = 1.82402, acc = 0.83175


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 137/200 ] loss = 4.84403, acc = 0.79213 ---------------------> best
Best model found at epoch 137, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 138/200 ] loss = 2.65726, loss_fea = 1.82578, acc = 0.83408


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 138/200 ] loss = 5.01499, acc = 0.78717


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 139/200 ] loss = 2.64321, loss_fea = 1.82792, acc = 0.82495


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 139/200 ] loss = 5.05379, acc = 0.77784


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 140/200 ] loss = 2.64292, loss_fea = 1.82942, acc = 0.83266


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 140/200 ] loss = 4.92657, acc = 0.78455


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 141/200 ] loss = 2.56683, loss_fea = 1.83051, acc = 0.83073


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 141/200 ] loss = 5.12333, acc = 0.76239


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 142/200 ] loss = 2.53068, loss_fea = 1.83198, acc = 0.83073


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 142/200 ] loss = 5.09824, acc = 0.77172


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 143/200 ] loss = 2.49474, loss_fea = 1.83296, acc = 0.83580


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 143/200 ] loss = 5.02317, acc = 0.77376


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 144/200 ] loss = 2.46560, loss_fea = 1.83532, acc = 0.83093


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 144/200 ] loss = 4.97358, acc = 0.76997


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 145/200 ] loss = 2.43227, loss_fea = 1.83850, acc = 0.83702


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 145/200 ] loss = 5.27990, acc = 0.78630


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 146/200 ] loss = 2.42443, loss_fea = 1.83913, acc = 0.82972


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 146/200 ] loss = 4.98349, acc = 0.77493


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 147/200 ] loss = 2.36059, loss_fea = 1.83769, acc = 0.83773


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 147/200 ] loss = 4.99202, acc = 0.79125


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 148/200 ] loss = 2.33849, loss_fea = 1.83981, acc = 0.83823


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 148/200 ] loss = 5.09682, acc = 0.77318


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 149/200 ] loss = 2.29067, loss_fea = 1.84194, acc = 0.83844


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 149/200 ] loss = 5.01649, acc = 0.77697


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 150/200 ] loss = 2.27759, loss_fea = 1.84303, acc = 0.84431


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 150/200 ] loss = 4.90565, acc = 0.78309


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 151/200 ] loss = 2.24564, loss_fea = 1.84326, acc = 0.83945


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 151/200 ] loss = 4.77308, acc = 0.79388 ---------------------> best
Best model found at epoch 151, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 152/200 ] loss = 2.18846, loss_fea = 1.84570, acc = 0.84016


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 152/200 ] loss = 4.92243, acc = 0.78746


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 153/200 ] loss = 2.18371, loss_fea = 1.84673, acc = 0.84036


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 153/200 ] loss = 4.97885, acc = 0.76910


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 154/200 ] loss = 2.14230, loss_fea = 1.84559, acc = 0.84310


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 154/200 ] loss = 5.08941, acc = 0.78834


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 155/200 ] loss = 2.11008, loss_fea = 1.84869, acc = 0.84654


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 155/200 ] loss = 4.88971, acc = 0.78571


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 156/200 ] loss = 2.07632, loss_fea = 1.84936, acc = 0.84482


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 156/200 ] loss = 5.11475, acc = 0.76618


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 157/200 ] loss = 2.05926, loss_fea = 1.85151, acc = 0.84188


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 157/200 ] loss = 4.94709, acc = 0.79942 ---------------------> best
Best model found at epoch 157, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 158/200 ] loss = 2.01660, loss_fea = 1.85424, acc = 0.84898


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 158/200 ] loss = 5.02761, acc = 0.76997


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 159/200 ] loss = 1.94040, loss_fea = 1.85288, acc = 0.84665


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 159/200 ] loss = 4.98795, acc = 0.77843


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 160/200 ] loss = 1.96181, loss_fea = 1.85502, acc = 0.84360


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 160/200 ] loss = 5.12186, acc = 0.75627


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 161/200 ] loss = 1.88906, loss_fea = 1.85582, acc = 0.84817


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 161/200 ] loss = 4.93834, acc = 0.78163


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 162/200 ] loss = 1.85896, loss_fea = 1.85727, acc = 0.85252


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 162/200 ] loss = 5.08405, acc = 0.78134


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 163/200 ] loss = 1.84653, loss_fea = 1.85945, acc = 0.84958


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 163/200 ] loss = 5.17671, acc = 0.77522


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 164/200 ] loss = 1.78534, loss_fea = 1.85940, acc = 0.85415


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 164/200 ] loss = 4.80004, acc = 0.78542


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 165/200 ] loss = 1.78604, loss_fea = 1.86143, acc = 0.84887


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 165/200 ] loss = 5.21777, acc = 0.77464


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 166/200 ] loss = 1.72196, loss_fea = 1.86060, acc = 0.84948


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 166/200 ] loss = 4.87387, acc = 0.79009


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 167/200 ] loss = 1.69469, loss_fea = 1.86472, acc = 0.85475


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 167/200 ] loss = 5.19720, acc = 0.77668


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 168/200 ] loss = 1.67320, loss_fea = 1.86481, acc = 0.85283


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 168/200 ] loss = 5.29325, acc = 0.76676


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 169/200 ] loss = 1.62483, loss_fea = 1.86483, acc = 0.85283


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 169/200 ] loss = 5.00262, acc = 0.78630


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 170/200 ] loss = 1.58616, loss_fea = 1.86673, acc = 0.86084


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 170/200 ] loss = 4.96356, acc = 0.78805


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 171/200 ] loss = 1.53495, loss_fea = 1.86846, acc = 0.85972


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 171/200 ] loss = 4.92789, acc = 0.79446


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 172/200 ] loss = 1.52883, loss_fea = 1.87138, acc = 0.85344


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 172/200 ] loss = 4.84363, acc = 0.78338


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 173/200 ] loss = 1.48381, loss_fea = 1.87304, acc = 0.86195


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 173/200 ] loss = 5.01619, acc = 0.78892


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 174/200 ] loss = 1.42032, loss_fea = 1.87296, acc = 0.85992


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 174/200 ] loss = 5.02188, acc = 0.78222


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 175/200 ] loss = 1.39749, loss_fea = 1.87440, acc = 0.86347


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 175/200 ] loss = 5.07983, acc = 0.76939


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 176/200 ] loss = 1.38334, loss_fea = 1.87409, acc = 0.85658


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 176/200 ] loss = 5.18693, acc = 0.77493


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 177/200 ] loss = 1.31311, loss_fea = 1.87526, acc = 0.86651


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 177/200 ] loss = 4.91938, acc = 0.78367


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 178/200 ] loss = 1.29709, loss_fea = 1.87458, acc = 0.86682


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 178/200 ] loss = 5.30805, acc = 0.77784


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 179/200 ] loss = 1.24940, loss_fea = 1.87435, acc = 0.86256


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 179/200 ] loss = 5.11001, acc = 0.78601


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 180/200 ] loss = 1.20069, loss_fea = 1.87510, acc = 0.86702


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 180/200 ] loss = 5.10821, acc = 0.78105


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 181/200 ] loss = 1.16936, loss_fea = 1.87974, acc = 0.86631


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 181/200 ] loss = 5.01933, acc = 0.79971 ---------------------> best
Best model found at epoch 181, saving model


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 182/200 ] loss = 1.12166, loss_fea = 1.87870, acc = 0.87046


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 182/200 ] loss = 4.98039, acc = 0.78134


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 183/200 ] loss = 1.06528, loss_fea = 1.87946, acc = 0.87503


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 183/200 ] loss = 5.25013, acc = 0.77843


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 184/200 ] loss = 1.04676, loss_fea = 1.88129, acc = 0.87107


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 184/200 ] loss = 5.17709, acc = 0.79271


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 185/200 ] loss = 1.00682, loss_fea = 1.88164, acc = 0.87786


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 185/200 ] loss = 5.01056, acc = 0.78630


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 186/200 ] loss = 0.95103, loss_fea = 1.88173, acc = 0.87715


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 186/200 ] loss = 5.40920, acc = 0.75802


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 187/200 ] loss = 0.91693, loss_fea = 1.88150, acc = 0.87726


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 187/200 ] loss = 5.31566, acc = 0.78513


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 188/200 ] loss = 0.88415, loss_fea = 1.88098, acc = 0.87361


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 188/200 ] loss = 5.27887, acc = 0.78746


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 189/200 ] loss = 0.83865, loss_fea = 1.88201, acc = 0.87847


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 189/200 ] loss = 5.11245, acc = 0.79475


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 190/200 ] loss = 0.78930, loss_fea = 1.88033, acc = 0.87675


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 190/200 ] loss = 5.33785, acc = 0.77201


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 191/200 ] loss = 0.74756, loss_fea = 1.88171, acc = 0.88131


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 191/200 ] loss = 5.24083, acc = 0.78659


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 192/200 ] loss = 0.70594, loss_fea = 1.88484, acc = 0.88425


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 192/200 ] loss = 5.45276, acc = 0.78834


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 193/200 ] loss = 0.67251, loss_fea = 1.88435, acc = 0.87959


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 193/200 ] loss = 5.40499, acc = 0.78805


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 194/200 ] loss = 0.61191, loss_fea = 1.88432, acc = 0.88567


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 194/200 ] loss = 5.45408, acc = 0.78659


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 195/200 ] loss = 0.57712, loss_fea = 1.88284, acc = 0.88090


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 195/200 ] loss = 5.33401, acc = 0.78047


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 196/200 ] loss = 0.53770, loss_fea = 1.88702, acc = 0.87888


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 196/200 ] loss = 5.57625, acc = 0.77988


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 197/200 ] loss = 0.45882, loss_fea = 1.88666, acc = 0.89611


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 197/200 ] loss = 5.83378, acc = 0.77289


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 198/200 ] loss = 0.43219, loss_fea = 1.88694, acc = 0.88374


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 198/200 ] loss = 6.06182, acc = 0.77259


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 199/200 ] loss = 0.37603, loss_fea = 1.88705, acc = 0.88486


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 199/200 ] loss = 6.04152, acc = 0.75219


  0%|          | 0/78 [00:00<?, ?it/s]

[ Train | 200/200 ] loss = 0.32729, loss_fea = 1.88768, acc = 0.88699


  0%|          | 0/27 [00:00<?, ?it/s]

[ Valid | 200/200 ] loss = 5.95700, acc = 0.77668
Finish training


In [17]:
# Build the evaluation dataset and the loader used for inference.
# shuffle=False keeps batches in dataset order; the submission cell assigns
# ids to predictions purely by position, so the order must not change.
eval_dir = os.path.join(cfg['dataset_root'], "evaluation")
eval_set = FoodDataset(eval_dir, tfm=test_tfm)
eval_loader = DataLoader(
    eval_set,
    batch_size=cfg['batch_size'],
    shuffle=False,
    num_workers=0,
    pin_memory=True,
)

One ../input/ml2022spring-hw13/food11-hw13/evaluation sample ../input/ml2022spring-hw13/food11-hw13/evaluation/0000.jpg


In [18]:
# Load model from {exp_name}/student_best.ckpt
student_model_best = get_student_model() # get a new student model to avoid reference before assignment.
ckpt_path = f"{save_path}/student_best.ckpt" # the ckpt path of the best student model.
# The state dict is loaded onto the CPU first; the model is moved to `device` right after.
student_model_best.load_state_dict(torch.load(ckpt_path, map_location='cpu'))
student_model_best.to(device) # set the student model to device

# Start evaluate
student_model_best.eval() # switch the model to evaluation mode before inference
eval_preds = [] # flat list with one predicted class index per evaluation image, in loader order

# Iterate the evaluation set by batches.
for batch in tqdm(eval_loader):
    # A batch consists of image data and corresponding labels.
    # The labels are ignored: the evaluation set has no usable ground truth.
    imgs, _ = batch
    # We don't need gradient in evaluation.
    # Using torch.no_grad() accelerates the forward process.
    with torch.no_grad():
        logits = student_model_best(imgs.to(device))
    # argmax over the last dimension picks the predicted class of every image.
    # reshape(-1) (instead of squeeze()) always yields a 1-D tensor, so a final
    # batch holding a single image still produces a one-element list; squeeze()
    # would collapse it to a 0-d value that cannot be iterated.
    preds = logits.argmax(dim=-1).reshape(-1).cpu().tolist()
    # loss and acc can not be calculated because we do not have the true labels of the evaluation set.
    eval_preds += preds

def pad4(i):
    """Return ``str(i)`` left-padded with "0" to a width of four characters.

    Values whose string form already has four or more characters are
    returned unchanged (nothing is truncated).
    """
    return str(i).rjust(4, "0")

# Save prediction results
# Ids are the zero-padded positions 0 .. len(eval_set)-1; each prediction is
# matched to an id purely by its position in eval_preds.
ids = list(map(pad4, range(len(eval_set))))
categories = eval_preds

# Two-column submission table: 'Id' first, then 'Category'.
df = pd.DataFrame({'Id': ids, 'Category': categories})
# index=False keeps the DataFrame's row index out of the CSV.
# now you can download the submission.csv and upload it to the kaggle competition.
df.to_csv(f"{save_path}/submission.csv", index=False)

  0%|          | 0/27 [00:00<?, ?it/s]