In [None]:
# 基礎資料處理切分
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Deep Learning 相關
import torch
from torch import nn, optim, utils

# 系統互動
import time
import argparse

# 自訂部分
from model_hie import ResNet50MothClassifier
from dataset_hie import ImageDatasetFromFileSpecial
from average_meter import AverageMeter
from center_loss import CenterLoss

# Root directory of the downloaded images and the mini-batch size used by the
# data loaders.
opt_dataroot = "./downloaded256/"
opt_batchSize = 48

# Load the specimen metadata (tab-separated file).
dataset_path = 'sp_meta.csv'
df = pd.read_csv(dataset_path, sep="\t")

# Drop records that were not identified down to species level.
df = df[df.Species.notna()].reset_index(drop=True)

# The genus is taken as the first space-separated word of the species name.
genus = [name.split(' ')[0] for name in df.Species]
df['Genus'] = genus

# Build the sorted class lists; the *_id arrays give each record's index into
# the matching list (np.unique with return_inverse=True).
species_list, species_id = np.unique(df.Species, return_inverse=True)
family_list, family_id = np.unique(df.Family, return_inverse=True)
genus_list, genus_id = np.unique(df.Genus, return_inverse=True)

# Load the cached train / valid / test split.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, so
# these files must come from a trusted source.
x_train = np.load('./datasplit_cache/x_train.npy', allow_pickle=True)
y_train = np.load('./datasplit_cache/y_train.npy', allow_pickle=True)
x_valid = np.load('./datasplit_cache/x_valid.npy', allow_pickle=True)
y_valid = np.load('./datasplit_cache/y_valid.npy', allow_pickle=True)
x_test = np.load('./datasplit_cache/x_test.npy', allow_pickle=True)
y_test = np.load('./datasplit_cache/y_test.npy', allow_pickle=True)


def _relocate_images(paths, root=opt_dataroot):
    """Return `paths` re-rooted under `root`, keeping only each file name.

    Args:
        paths: iterable of '/'-separated image paths.
        root: directory prefix (with trailing slash) to prepend.

    Returns:
        A list of strings of the form ``root + <file name>``.
    """
    return [f'{root}{pth.split("/")[-1]}' for pth in paths]


# The cached x_* arrays store paths from wherever the split was created, so
# they are rewritten to point at this machine's image directory.
x_train_new = _relocate_images(x_train)
x_valid_new = _relocate_images(x_valid)
x_test_new = _relocate_images(x_test)
# y holds the labels. The column order is the reverse of the one used
# originally: column 0 is family, column 1 is genus, column 2 is species.

# NOTE: the triple-quoted string below is disabled code, kept only for
# reference; as a bare string expression it has no effect at runtime.
# It shows a split built from 'sp_mod.csv' with train_test_split
# (random_state=5566, 80/20 twice). Its label columns are ordered
# (species, genus, family), whereas the training loop in this file reads
# column 0 as family, 1 as genus and 2 as species.
# NOTE(review): presumably this is how ./datasplit_cache was first produced;
# confirm before relying on it.
'''
# 讀取資料 & 前處理
dataset_path = 'sp_mod.csv'
df = pd.read_csv(dataset_path, sep="\t")

# 移除未鑑定到種的資料
df = df[~df.Species.isna()].reset_index(drop=True)

# Remove data without Genus or Family
df = df[~df.Genus.isna()].reset_index(drop=True)
df = df[~df.Family.isna()].reset_index(drop=True)

# 造出物種清單，並產生每筆資料對應的物種 id, 視為 classification 用的 target y
species_list, species_id = np.unique(df.Species, return_inverse=True)
genus_list, genus_id = np.unique(df.Genus, return_inverse=True)
family_list, family_id = np.unique(df.Family, return_inverse=True)
y = np.array(list(zip(species_id, genus_id, family_id)))

# 產生影像路徑
x = img_paths = ('./downloaded256/' + df.Number + '.jpg').values

# 切 train, valid, test
x_train_valid, x_test, y_train_valid, y_test = train_test_split(x, y,  train_size=.8, test_size=.2, random_state=5566)
x_train, x_valid, y_train, y_valid = train_test_split(x_train_valid, y_train_valid,  train_size=.8, test_size=.2, random_state=5566)
'''

# Training data: augmentation switched on, batches drawn in shuffled order.
# NOTE(review): the empty-string second argument is passed through unchanged;
# its meaning is defined in dataset_hie — confirm there.
train_set = ImageDatasetFromFileSpecial(
    x_train_new,
    "",
    y=y_train,
    aug=True,
)
train_data_loader = utils.data.DataLoader(
    train_set,
    batch_size=opt_batchSize,
    shuffle=True,
)

# Validation (and test) data need no augmentation and keep their order.
valid_set = ImageDatasetFromFileSpecial(
    x_valid_new,
    "",
    y=y_valid,
    aug=False,
)
valid_data_loader = utils.data.DataLoader(
    valid_set,
    batch_size=opt_batchSize,
    shuffle=False,
)

# Save the model to disk.
def save_checkpoint(model, model_out_path="resnet50_moth_classifier_hie.pth"):
    """Serialize `model` to `model_out_path` and report where it went.

    The object is stored whole (not just its state dict) under the key
    "model" of a one-entry dict, via ``torch.save``.

    Args:
        model: the object to store.
        model_out_path: destination file path.
    """
    checkpoint = {"model": model}
    torch.save(checkpoint, model_out_path)
    print(f"Checkpoint saved to {model_out_path}")

# Build the classifier with one output size per taxonomic level and move it
# into the memory of GPU 0.
model = ResNet50MothClassifier(
    num_of_species=len(species_list),
    num_of_genus=len(genus_list),
    num_of_family=len(family_list),
).to('cuda:0')

# Cross-entropy loss used for the species, genus and family outputs.
cross_entropy = nn.CrossEntropyLoss()

# Center loss over the species classes on 2048-dimensional features. It has
# its own parameters, so it gets a dedicated optimizer next to the model's.
center_loss = CenterLoss(
    num_classes=len(species_list),
    feat_dim=2048,
    use_gpu=True,
)
adam = optim.Adam(
    model.parameters(),
    lr=5e-5,
    betas=(0.5, 0.999),
    weight_decay=1e-3,
)
centerloss_optimizer = optim.Adam(
    center_loss.parameters(),
    lr=5e-2,
    betas=(0.5, 0.999),
    weight_decay=1e-3,
)
# Weight applied to the center-loss term when it is added to the total loss.
centerloss_alpha = 0.005

# Best (lowest) validation loss seen so far.
min_criteria = np.inf

# Early stopping: number of non-improving epochs tolerated, and the running
# count of consecutive non-improving epochs.
early_stop_threshold = 50
early_stop_counter = 0

# Start a fresh log file (truncating any previous one) and write the header
# row: a 5-wide epoch column followed by 10-wide columns, tab-separated.
log_header_fields = ['%5s' % 'epoch']
log_header_fields += [
    '%10s' % column_name
    for column_name in (
        'time_cost',
        'train_loss',
        'valid_loss',
        't_center',
        't_species',
        't_genus',
        't_family',
        'v_center',
        'v_species',
        'v_genus',
        'v_family',
    )
]
# The newline is joined like any other field, so a tab precedes it.
log_header_fields.append('\n')

with open('./resnet50_moth_classifier_hie.log','w') as loss_log:
    loss_log.write("\t".join(log_header_fields))

# Device that holds the model, the center-loss module and every batch.
_DEVICE = 'cuda:0'


def _batch_losses(images, label):
    """Run one forward pass and return the four loss terms for a batch.

    Uses the module-level `model`, `center_loss`, `cross_entropy` and
    `centerloss_alpha`. Gradient tracking follows the caller's context, so
    calling this inside ``torch.no_grad()`` yields losses without a graph.

    Args:
        images: batch of images as yielded by the data loader.
        label: integer label tensor indexed as ``label[:, k]``;
            column 0 = family, column 1 = genus, column 2 = species.

    Returns:
        Tuple ``(ctr_loss, spc_loss, gns_loss, fml_loss)``, where `ctr_loss`
        is already multiplied by `centerloss_alpha`.
    """
    feat, species_output, genus_output, family_output = model(images.to(_DEVICE))
    # Move each label column to the device once; the species column feeds
    # both the center loss and the species cross entropy.
    family_label = label[:, 0].to(_DEVICE)
    genus_label = label[:, 1].to(_DEVICE)
    species_label = label[:, 2].to(_DEVICE)
    # The model applies no softmax to its outputs, because PyTorch's
    # cross-entropy loss has it built in.
    ctr_loss = center_loss(feat.view(-1, 2048), species_label) * centerloss_alpha
    spc_loss = cross_entropy(species_output, species_label)
    gns_loss = cross_entropy(genus_output, genus_label)
    fml_loss = cross_entropy(family_output, family_label)
    return ctr_loss, spc_loss, gns_loss, fml_loss


# Reference point for the total elapsed time.
start_time = time.time()

# Minimum improvement of the validation loss that counts as progress; raise
# it if very small differences should be ignored.
min_delta = 0

for epoch in range(0, 200 + 1):

    # Simple running-average trackers, reset every epoch.
    train_loss = AverageMeter()
    train_loss_ctr = AverageMeter()
    train_loss_spc = AverageMeter()
    train_loss_gns = AverageMeter()
    train_loss_fml = AverageMeter()
    valid_loss = AverageMeter()
    valid_loss_ctr = AverageMeter()
    valid_loss_spc = AverageMeter()
    valid_loss_gns = AverageMeter()
    valid_loss_fml = AverageMeter()
    batch_time = AverageMeter()

    #--------------train------------
    model = model.train()
    center_loss = center_loss.train()
    for iteration, (auged, label) in enumerate(train_data_loader, 0):

        batch_start_time = time.time()
        ctr_loss, spc_loss, gns_loss, fml_loss = _batch_losses(auged, label)
        loss = ctr_loss + spc_loss + gns_loss + fml_loss

        # Clear the gradients left over from the previous step.
        adam.zero_grad()
        centerloss_optimizer.zero_grad()
        # Compute this step's gradients.
        loss.backward()
        # Update the model parameters.
        adam.step()
        # Multiply by (1 / alpha) so that alpha does not affect how the
        # centers themselves are updated.
        for param in center_loss.parameters():
            param.grad.data *= (1. / centerloss_alpha)
        centerloss_optimizer.step()

        # Elapsed time: total since start, and for this batch.
        time_cost = time.time() - start_time
        batch_time_cost = time.time() - batch_start_time
        batch_time.update(batch_time_cost)

        # Record the per-batch training losses for this epoch.
        train_loss.update(loss.item())
        train_loss_ctr.update(ctr_loss.item())
        train_loss_spc.update(spc_loss.item())
        train_loss_gns.update(gns_loss.item())
        train_loss_fml.update(fml_loss.item())

        info = "====> Training Epoch[{}]({}/{}); time:({:.2f}/{:.2f}); train loss:{:.2f}; center:{:.2f}; species:{:.2f}; genus:{:.2f}; family:{:.2f}".format(epoch+1, iteration+1, len(train_data_loader), batch_time.avg, time_cost, train_loss.avg, train_loss_ctr.avg, train_loss_spc.avg, train_loss_gns.avg, train_loss_fml.avg)
        print(info, end='\r')

    print()

    #--------------valid------------
    with torch.no_grad():
        batch_time = AverageMeter()
        # Switch to eval mode for validation (this affects layers such as
        # batch norm).
        model = model.eval()
        center_loss = center_loss.eval()
        for iteration, (img_for_valid, label) in enumerate(valid_data_loader, 0):

            batch_start_time = time.time()
            ctr_loss, spc_loss, gns_loss, fml_loss = _batch_losses(img_for_valid, label)
            loss = ctr_loss + spc_loss + gns_loss + fml_loss

            time_cost = time.time() - start_time
            batch_time_cost = time.time() - batch_start_time
            batch_time.update(batch_time_cost)

            # Record the per-batch validation losses for this epoch.
            valid_loss.update(loss.item())
            valid_loss_ctr.update(ctr_loss.item())
            valid_loss_spc.update(spc_loss.item())
            valid_loss_gns.update(gns_loss.item())
            valid_loss_fml.update(fml_loss.item())

            info = "====> Validation ({}/{}): time:({:.2f}/{:.2f}): valid loss:{:.2f}; center:{:.2f}; species:{:.2f}; genus:{:.2f}; family:{:.2f}".format(iteration+1, len(valid_data_loader), batch_time.avg, time_cost, valid_loss.avg, valid_loss_ctr.avg, valid_loss_spc.avg, valid_loss_gns.avg, valid_loss_fml.avg)
            print(info, end='\r')

    print()

    valid_loss_avg = valid_loss.avg

    # The mean validation loss is the model-selection criterion.
    criteria = valid_loss_avg

    if min_criteria - criteria >= min_delta:
        # New best: remember it, checkpoint the model, reset the patience.
        min_criteria = criteria
        print ("Min criteria: %.2f" % min_criteria)
        save_checkpoint(model, model_out_path = "resnet50_moth_classifier_hie.pth")
        early_stop_counter = 0
    else:
        early_stop_counter += 1
        print ("Early stop counter: %d" % early_stop_counter)

    # Append this epoch's row BEFORE the early-stop check, so the final epoch
    # of an early-stopped run is logged too.
    with open('./resnet50_moth_classifier_hie.log','a') as loss_log:
        loss_log.write(
            "\t".join([
                '%5s' % str(epoch+1),
                '%10.2f' % time_cost,
                '%10.6f' % train_loss.avg,
                '%10.6f' % valid_loss.avg,
                '%10.6f' % train_loss_ctr.avg,
                '%10.6f' % train_loss_spc.avg,
                '%10.6f' % train_loss_gns.avg,
                '%10.6f' % train_loss_fml.avg,
                '%10.6f' % valid_loss_ctr.avg,
                '%10.6f' % valid_loss_spc.avg,
                '%10.6f' % valid_loss_gns.avg,
                '%10.6f' % valid_loss_fml.avg,
                '\n',
            ])
        )

    if early_stop_counter > early_stop_threshold:
        print ("Early stopped.")
        break

# Evaluation on the test split is not implemented here; it is left as an
# exercise to try out.

Using cache found in /home/jovyan/.cache/torch/hub/pytorch_vision_v0.3.0


====> Training Epoch[1](713/713); time:(0.31/484.56); train loss:13.24; center:0.85; species:6.18; genus:5.08; family:1.14
====> Validation (180/180): time:(0.09/518.02): valid loss:9.30; center:0.55; species:4.74; genus:3.45; family:0.55
Min criteria: 9.30
Checkpoint saved to resnet50_moth_classifier_hie.pth
====> Training Epoch[2](713/713); time:(0.31/1001.99); train loss:7.57; center:0.68; species:3.95; genus:2.58; family:0.36
====> Validation (180/180): time:(0.09/1035.56): valid loss:5.91; center:0.76; species:3.16; genus:1.79; family:0.20
Min criteria: 5.91
Checkpoint saved to resnet50_moth_classifier_hie.pth
====> Training Epoch[3](713/713); time:(0.31/1519.81); train loss:5.18; center:0.72; species:2.81; genus:1.46; family:0.18
====> Validation (180/180): time:(0.09/1553.19): valid loss:4.39; center:0.77; species:2.37; genus:1.11; family:0.14
Min criteria: 4.39
Checkpoint saved to resnet50_moth_classifier_hie.pth
====> Training Epoch[4](423/713); time:(0.31/1840.23); train loss