In [1]:
import moxing as mox
import zipfile
import os
import time
import datetime
import copy
from collections import defaultdict
from tqdm import tqdm
import shutil
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
from PIL import Image
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torchvision
from torch import nn
from torch.nn import Linear, ReLU, CrossEntropyLoss, Sequential, Conv2d, MaxPool2d, Module, Softmax, BatchNorm2d, Dropout
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim import Adam, SGD
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms, utils

INFO:root:Using MoXing-v1.17.3-
INFO:root:Using OBS-Python-SDK-3.20.7


In [2]:
obs_root_path = "obs://tinyimages/TinyImageNet/"

In [3]:
mox.file.copy_parallel(obs_root_path + "TinyImageNet.zip", "TinyImageNet.zip")
zip_file = zipfile.ZipFile("./TinyImageNet.zip")
zip_list = zip_file.namelist()
for f in zip_list:
    zip_file.extract(f, "./")
zip_file.close()

In [4]:
ckpts = sorted([x for x in mox.file.list_directory(obs_root_path + "ckpts/") if x.endswith(".ckpt")])
if len(ckpts) > 0:
    last_ckpt = ckpts[-1]
    mox.file.copy_parallel(obs_root_path + "ckpts/" + last_ckpt, last_ckpt)
mox.file.copy_parallel(obs_root_path + "layers.py", "layers.py")
from layers import *

In [5]:
cuda_gpu = torch.cuda.is_available()
if cuda_gpu:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu:0")
nclass = 100

In [6]:
EPOCHS = 30
BATCH_SIZE = 100
LR = 0.10
SAVE_EPOCH = 5
VAL_EPOCH = 5
verbose = 500
loss_func = nn.CrossEntropyLoss()

In [7]:
data_transforms = transforms.Compose([
    #transforms.RandomResizedCrop(64),
    transforms.RandomHorizontalFlip(),
    transforms.Resize(256),
    transforms.ToTensor()
    #transforms.Normalize([124,117,104],[58.4,57.1,57.4])
    ])

class ImageNetData(Dataset):
    
    def __init__(self, path, info, nclass):
        self.path = path
        info_path = os.path.join(path, info)
        f = open(info_path)
        self.data = f.read().splitlines()
        self.transforms = data_transforms
        self.nclass = nclass
    
    def __len__(self):
        return len(self.data)
        
        
    def __getitem__(self, index):
        img = Image.open(os.path.join(self.path, self.data[index].split(' ')[0]))
        img = self.transforms(img)
        label = self.data[index].split(' ')[1]
        label = int(label)
#         one_hot = np.zeros(self.nclass, dtype=np.int)
#         one_hot[label] = 1
        return (img, label)
        
path = "/home/ma-user/work/TinyImageNet/"
train_info_path = path + '/train.txt'
val_info_path = path + '/val.txt'
train_set = ImageNetData(path, "train.txt", nclass)
val_set = ImageNetData(path, "val.txt", nclass)

train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader = torch.utils.data.DataLoader(dataset=val_set, batch_size=BATCH_SIZE)

In [8]:
ckpts = sorted([x for x in os.listdir("./") if x.endswith(".ckpt")])
model = Inception_ResNet_v2_SE().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
lr_reduct = lambda epochs: 0.5 ** (epochs // 20)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_reduct)
if len(ckpts) >= 1:
    state_dicts = torch.load(ckpts[-1])
    model.load_state_dict(state_dicts['model'])
    optimizer.load_state_dict(state_dicts['optimizer'])
    scheduler.load_state_dict(state_dicts['scheduler'])

In [9]:
train_loss = []
train_acc = []
val_loss = []
val_acc = []
with torch.autograd.set_detect_anomaly(True):
    for epochs in range(1, EPOCHS + 1):
        print('Epoch {}/{}'.format(epochs, EPOCHS))
        start_time = time.time()
        running_train_loss = []
        running_train_acc = []
        for i, data in enumerate(train_loader):
            inputs = data[0].to(device)
            targets = data[1].to(device)
            outputs = model(inputs)
            loss = loss_func(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            pred = outputs.argmax(axis=1)
            running_train_loss.append(float(loss))
            running_train_acc.append(float((pred==targets).sum()) / BATCH_SIZE)
            if verbose > 0 and (i + 1) % verbose == 0:
                print("Train: {}/{} - Average Time: {}".format(i + 1, len(train_loader), (time.time()-start_time) / (i + 1)))
        print("End of Train Epoch: {} - Average Time: {}".format(epochs, (time.time() - start_time) / (i + 1)))
        # 周期性清除CUDA缓存
        torch.cuda.empty_cache()
        if epochs % SAVE_EPOCH == 0:
            model_save_name = "model_{}.ckpt".format(datetime.datetime.now().strftime("%Y%m%d%H%M%S")) 
            model_save_path = "/home/ma-user/work/" + model_save_name
            torch.save({'model': model.state_dict(), 
                       'optimizer': optimizer.state_dict(), 
                       'scheduler': scheduler.state_dict()},
                       model_save_path)
            mox.file.copy_parallel(model_save_path, 'obs://tinyimages/TinyImageNet/'+model_save_name)
        if epochs % VAL_EPOCH == 0:
            running_val_loss = []
            running_val_acc = []
            for i, data in enumerate(val_loader):
                inputs=data[0].to(device)
                targets=data[1].to(device)
                with torch.no_grad():
                    outputs = model(inputs)
                    loss = loss_func(outputs, targets)
                    pred = outputs.argmax(axis=1)
                    running_val_loss.append(float(loss))
                    running_val_acc.append(float((pred==targets).sum()) / BATCH_SIZE)
                if verbose > 0 and(i + 1) % verbose == 0:
                    print("Val: {}/{}".format(i + 1, len(val_loader)))
        
        scheduler.step()    
        train_loss.append(sum(running_train_loss) / len(running_train_loss))
        train_acc.append(sum(running_train_acc) / len(running_train_acc))
        duration = time.time() - start_time
        print('Train loss : {} Train accuracy : {}'.format(train_loss[-1], train_acc[-1]))
        if epochs % VAL_EPOCH == 0:
            val_loss.append(sum(running_val_loss) / len(running_val_loss))
            val_acc.append(sum(running_val_acc) / len(running_val_acc))
            print('Val   loss : {} Val   accuracy : {}'.format(val_loss[-1],val_acc[-1]))
        print('{:.0f}m {:.0f}s'.format(duration // 60, duration % 60))

Epoch 1/30
Train: 500/1000 - Average Time: 1.1169691324234008
Train: 1000/1000 - Average Time: 1.1162366991043091
End of Train Epoch: 1 - Average Time: 1.1162385911941528
Train loss : 4.611499159812928 Train accuracy : 0.010569999999999873
18m 36s
Epoch 2/30
Train: 500/1000 - Average Time: 1.1133054718971251
Train: 1000/1000 - Average Time: 1.1099235389232636
End of Train Epoch: 2 - Average Time: 1.1099253253936767
Train loss : 4.611687847137452 Train accuracy : 0.01050999999999988
18m 30s
Epoch 3/30
Train: 500/1000 - Average Time: 1.1059219875335693
Train: 1000/1000 - Average Time: 1.1035705137252807
End of Train Epoch: 3 - Average Time: 1.1035723752975464
Train loss : 4.6120483984947205 Train accuracy : 0.010149999999999888
18m 24s
Epoch 4/30
Train: 500/1000 - Average Time: 1.1027299327850342
Train: 1000/1000 - Average Time: 1.1024694149494172
End of Train Epoch: 4 - Average Time: 1.1024711301326753
Train loss : 4.611792888641357 Train accuracy : 0.010409999999999886
18m 23s
Epoch 5/

In [10]:
model_save_name = "model_{}.ckpt".format(datetime.datetime.now().strftime("%Y%m%d%H%M%S")) 
model_save_path = "/home/ma-user/work/" + model_save_name
torch.save({'model': model.state_dict(), 
           'optimizer': optimizer.state_dict(), 
           'scheduler': scheduler.state_dict()},           model_save_path)
mox.file.copy_parallel(model_save_path, 'obs://tinyimages/TinyImageNet/'+model_save_name)