In [6]:
import torch
from torch.utils.data import DataLoader
from dataset.dataset import get_cdiscount_dataset
from model.model import assemble_model, assemble_model_with_classifier
from trainer.trainer import get_trainer

import numpy as np
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from torch_deform_conv.layers import ConvOffset2D
from torch_deform_conv.cnn import get_vgg11_bn, get_vgg11_bn_deform
from torch_deform_conv.utils import transfer_weights

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# redirect print to file
# import sys
# sys.stdout = open("PyTorch-resnet34-log.txt", "w")

In [7]:
# Run configuration. The same dict is used to size the DataLoaders and is
# passed to get_trainer().
# NOTE(review): how the trainer interprets each key is defined outside this
# notebook; the grouping below follows the key names only — confirm there.
config = {
    # mini-batch sizes for the two DataLoaders
    'train_batch_size': 128,
    'val_batch_size': 128,

    # network selection
    'arch': 'vgg11_bn',
    'pretrained': True,

    # optimiser settings (decay_lr_freq is a float literal: 40000.0)
    'optimizer': 'Adam',
    'learning_rate': 1e-4,
    'decay_lr_freq': 4e4,
    'weight_decay': 1e-5,

    # checkpoint / schedule
    'resume': None,
    'start_epoch': 0,
    'epochs': 10,

    # reporting cadence (validate_freq and save_freq are float literals)
    'print_freq': 10,
    'validate_freq': 7e4,
    'save_freq': 1e3,

    # running best top-1 validation precision, starts at zero
    'best_val_prec1': 0,
}

In [8]:
import torchvision.models as models

# Build the training and validation datasets.
# Both splits read from the same BSON file and the same offsets CSV; only the
# image-list CSV differs between them.
print('getting dataset...')
shared_dataset_kwargs = dict(
    offsets_csv="train_offsets.csv",
    bson_file_path="/mnt/data/cdiscount/train.bson",
    with_label=True,
    resize=160,
)
train_dataset = get_cdiscount_dataset(images_csv="train_images.csv", **shared_dataset_kwargs)
val_dataset = get_cdiscount_dataset(images_csv="val_images.csv", **shared_dataset_kwargs)

# Wrap each dataset in a DataLoader; batch sizes come from `config`.
# NOTE(review): the validation loader is shuffled too — confirm the trainer
# relies on that before switching it off.
print('getting data loader...')
shared_loader_kwargs = dict(shuffle=True, num_workers=6)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=config['train_batch_size'],
    **shared_loader_kwargs
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=config['val_batch_size'],
    **shared_loader_kwargs
)



getting dataset...
getting data loader...


In [9]:
# Build the baseline CNN from the VGG11-BN constructor imported from
# torch_deform_conv.cnn.
# NOTE(review): config['arch'] and config['pretrained'] are not consulted
# here — confirm that is intended.
backbone = get_vgg11_bn()

# assemble_model takes its arguments positionally; their meaning is defined
# in model/model.py, which is not shown here.
# presumably 12800 is the flattened feature size and 5270 the number of
# output classes — TODO confirm; the role of -1 is not visible from here.
assembled = assemble_model(backbone, -1, 12800, 5270)

# Wrap for multi-GPU data parallelism, move to the GPU, and show the layout.
model = torch.nn.DataParallel(assembled).cuda()
print(model)

DataParallel (
  (module): AssembledModel (
    (model): Sequential (
      (0): Sequential (
        (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
        (2): ReLU (inplace)
        (3): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
        (4): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (5): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True)
        (6): ReLU (inplace)
        (7): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
        (8): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (9): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True)
        (10): ReLU (inplace)
        (11): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (12): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True)
        (13): ReLU (inplace)
        (14): MaxPool2d (size=(2, 2), stride=(2, 2), dilation

In [10]:
# Loss function: cross-entropy, moved to the GPU before it is handed over.
criterion = torch.nn.CrossEntropyLoss()
criterion = criterion.cuda()

# Build the trainer from the two loaders, the model, the loss and the config.
# The capitalised name is kept because it is a notebook-level variable.
Trainer = get_trainer(
    train_dataloader,
    val_dataloader,
    model,
    criterion,
    config,
)

# Start the training loop.
Trainer.run()

start training
Epoch: [0][0/77344]	Time 27.574 (27.574)	Data 3.946 (3.946)	Loss 8.8177 (8.8177)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][10/77344]	Time 1.213 (3.548)	Data 0.006 (0.366)	Loss 8.3564 (8.5310)	Prec@1 2.344 (1.207)	Prec@5 4.688 (3.551)
Epoch: [0][20/77344]	Time 1.226 (2.439)	Data 0.011 (0.195)	Loss 7.7380 (8.2250)	Prec@1 6.250 (2.307)	Prec@5 12.500 (5.990)
Epoch: [0][30/77344]	Time 1.212 (2.044)	Data 0.007 (0.135)	Loss 7.4419 (7.9778)	Prec@1 5.469 (2.848)	Prec@5 11.719 (6.930)
Epoch: [0][40/77344]	Time 1.223 (1.843)	Data 0.008 (0.104)	Loss 7.1791 (7.7946)	Prec@1 2.344 (3.182)	Prec@5 14.062 (7.736)
Epoch: [0][50/77344]	Time 1.201 (1.720)	Data 0.008 (0.085)	Loss 7.2481 (7.6578)	Prec@1 7.812 (3.600)	Prec@5 14.844 (8.395)
Epoch: [0][60/77344]	Time 1.214 (1.638)	Data 0.010 (0.073)	Loss 7.0288 (7.5768)	Prec@1 6.250 (3.778)	Prec@5 12.500 (8.786)
Epoch: [0][70/77344]	Time 1.218 (1.579)	Data 0.008 (0.064)	Loss 7.3221 (7.5126)	Prec@1 7.031 (4.060)	Prec@5 14.844 (9.254)
Ep

Epoch: [0][660/77344]	Time 1.221 (1.272)	Data 0.009 (0.016)	Loss 5.8275 (6.2699)	Prec@1 10.938 (11.657)	Prec@5 21.094 (21.597)
Epoch: [0][670/77344]	Time 1.239 (1.272)	Data 0.009 (0.016)	Loss 5.3448 (6.2575)	Prec@1 18.750 (11.747)	Prec@5 31.250 (21.755)
Epoch: [0][680/77344]	Time 1.225 (1.271)	Data 0.009 (0.016)	Loss 5.3483 (6.2440)	Prec@1 18.750 (11.864)	Prec@5 35.938 (21.917)
Epoch: [0][690/77344]	Time 1.231 (1.271)	Data 0.014 (0.016)	Loss 5.4893 (6.2334)	Prec@1 21.875 (11.952)	Prec@5 34.375 (22.034)
Epoch: [0][700/77344]	Time 1.243 (1.270)	Data 0.018 (0.016)	Loss 5.4468 (6.2247)	Prec@1 18.750 (12.043)	Prec@5 34.375 (22.169)
Epoch: [0][710/77344]	Time 1.232 (1.270)	Data 0.009 (0.016)	Loss 5.2205 (6.2154)	Prec@1 19.531 (12.100)	Prec@5 33.594 (22.277)
Epoch: [0][720/77344]	Time 1.226 (1.269)	Data 0.011 (0.016)	Loss 6.0006 (6.2042)	Prec@1 17.188 (12.200)	Prec@5 26.562 (22.406)
Epoch: [0][730/77344]	Time 1.216 (1.269)	Data 0.011 (0.016)	Loss 5.5288 (6.1930)	Prec@1 18.750 (12.281)	Prec@5 

Process Process-9:
KeyboardInterrupt
Process Process-10:
Process Process-7:
Process Process-12:
Process Process-11:
Process Process-8:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()


KeyboardInterrupt: 

Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/weiso/.local/lib/python3.5/site-packages/torch/utils/data/dataloader.py", line 34, in _worker_loop
    r = index_queue.get()
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/queues.py", line 342, in get
    with self._rlock:
  File "/ho