In [20]:
import os
import torch
import cv2

from torch.utils.data import Dataset, DataLoader


from data import BaseTransform
from ssd import build_ssd

from data import VOC_CLASSES
# Alias for VOC_CLASSES as imported from `data`.
labelmap = VOC_CLASSES

# Benchmark settings shared by the cells below.
batch_size = 20
use_cuda = True

# The block below is an inert string literal, so it is never executed; the
# default tensor type is switched explicitly in later cells instead.
'''
if use_cuda and torch.cuda.is_available():
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
else:
    torch.set_default_tensor_type('torch.FloatTensor')
'''
    
def load_net(cache_path, map_location='cpu'):
    """Build an SSD network in test mode and load trained weights into it.

    Args:
        cache_path: Path to the checkpoint file holding the state dict.
        map_location: Forwarded to ``torch.load``. Defaults to ``'cpu'`` so
            that a checkpoint saved from GPU tensors can also be read on a
            machine without CUDA and is not first materialised on the GPU.
            ``load_state_dict`` copies the values into the network's existing
            parameters, so the network's device does not change.

    Returns:
        The network with the weights loaded, switched to evaluation mode.
    """
    num_classes = len(VOC_CLASSES) + 1  # +1 background
    net = build_ssd('test', 300, num_classes)  # initialize SSD
    state_dict = torch.load(cache_path, map_location=map_location)
    net.load_state_dict(state_dict)
    net.eval()
    return net


class SimpleDataset(Dataset):
    """Dataset serving every file directly inside one directory as an image tensor.

    Args:
        root: Directory containing the image files.
        transform: Callable applied to the decoded image. Element ``[0]`` of
            its return value must be a NumPy array with three axes; it is
            presumably a ``BaseTransform`` returning an (H, W, C) image --
            confirm against the caller.
    """

    def __init__(self, root, transform):
        self.root = root
        # os.listdir returns entries in arbitrary order: sort so that the
        # index -> file mapping is reproducible, and drop sub-directories,
        # which cv2.imread cannot decode.
        self.listdir = sorted(
            name for name in os.listdir(self.root)
            if os.path.isfile(os.path.join(self.root, name))
        )
        self.transform = transform

    def __len__(self):
        """Return the number of files found in ``root``."""
        return len(self.listdir)

    def __getitem__(self, idx):
        """Load, transform and return the ``idx``-th image as a tensor.

        Raises:
            IOError: If the file cannot be decoded as an image.
        """
        path = os.path.join(self.root, self.listdir[idx])
        img = cv2.imread(path, cv2.IMREAD_COLOR)
        if img is None:
            # cv2.imread reports failure by returning None instead of raising;
            # fail here with the offending path rather than inside transform.
            raise IOError('cannot read image: {}'.format(path))
        # Move the last axis first (channels-last -> channels-first).
        x = torch.from_numpy(self.transform(img)[0]).permute(2, 0, 1)
        return x

In [2]:
# Make tensors created without an explicit type default to CPU float32,
# so the first measurement runs entirely on the CPU.
torch.set_default_tensor_type('torch.FloatTensor')

In [3]:
# load_net() already returns the network in eval mode, so no extra
# net.eval() call is needed here.
net = load_net('weights/ssd300_COCO_6000.pth')
# The 3-tuple is passed to BaseTransform as its second argument -- presumably
# the per-channel mean subtracted from each image; confirm in data.BaseTransform.
base_transform = BaseTransform(net.size, (104, 117, 123))

In [4]:
# Sequential, unpinned loader over the sample images for the CPU run.
dataset = SimpleDataset(root='images_mini', transform=base_transform)
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=False,
    pin_memory=False,
)

先测试纯 CPU 推理的耗时。

In [5]:
#net.cpu();

In [6]:
%%time
for batch in dataloader:
    with torch.no_grad():
        y = net(batch)

Wall time: 58.5 s


In [21]:
# Switch the default tensor type to CUDA float32 before the network is
# rebuilt for the GPU measurements.
torch.set_default_tensor_type('torch.cuda.FloatTensor')

In [22]:
# Rebuild the network under the current default tensor type. load_net()
# already returns it in eval mode, so no extra net.eval() call is needed.
net = load_net('weights/ssd300_COCO_6000.pth')
# The 3-tuple is passed to BaseTransform as its second argument -- presumably
# the per-channel mean subtracted from each image; confirm in data.BaseTransform.
base_transform = BaseTransform(net.size, (104, 117, 123))

In [23]:
# Same data as before, still without pinned host memory.
dataset = SimpleDataset(root='images_mini', transform=base_transform)
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=False,
    pin_memory=False,
)

In [24]:
%%time
for batch in dataloader:
    batch = batch.cuda()
    with torch.no_grad():
        y = net(batch)

Wall time: 26.2 s


In [11]:
# This time the loader hands out batches in pinned (page-locked) host memory.
dataset = SimpleDataset(root='images_mini', transform=base_transform)
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=False,
    pin_memory=True,
)

In [12]:
%%time
for batch in dataloader:
    batch = batch.cuda()
    with torch.no_grad():
        y = net(batch)

Wall time: 24.3 s


GPU 本应比 CPU 快很多；提速不明显，说明大量时间花在了计算以外的环节。

In [13]:
# Fresh pinned-memory loader to feed the prefetcher defined below.
dataset = SimpleDataset(root='images_mini', transform=base_transform)
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=False,
    pin_memory=True,
)

In [14]:
class data_prefetcher():
    """Wrap a loader and stage the next batch on the GPU while the current one is used.

    Each batch (a tensor) is copied to the GPU and cast to float32 on a
    dedicated CUDA stream. Call ``next()`` repeatedly; it returns ``None``
    once the loader is exhausted. When CUDA is not available the batches are
    only cast to float32 and passed through on the CPU.

    Note: the copy can overlap with compute only if the batches live in
    pinned host memory (e.g. ``DataLoader(..., pin_memory=True)``).

    Args:
        loader: Any iterable yielding tensors.
    """

    def __init__(self, loader):
        self.loader = iter(loader)
        # Side stream for the copies; None means "no CUDA, pass batches through".
        self.stream = torch.cuda.Stream() if torch.cuda.is_available() else None
        # Start fetching the first batch right away.
        self.preload()

    def preload(self):
        """Fetch the next batch and start moving it to the GPU; store None at the end."""
        try:
            self.next_input = next(self.loader)
        except StopIteration:
            self.next_input = None
            return
        if self.stream is None:
            self.next_input = self.next_input.float()
            return
        with torch.cuda.stream(self.stream):
            self.next_input = self.next_input.cuda(non_blocking=True)
            self.next_input = self.next_input.float()

    def next(self):
        """Return the staged batch (or None when exhausted) and start staging the following one."""
        if self.stream is not None:
            # Make the consumer's stream wait until the staged copy has finished.
            torch.cuda.current_stream().wait_stream(self.stream)
        input = self.next_input
        if input is not None and self.stream is not None:
            # The tensor was allocated on the side stream; tell the caching
            # allocator it is now used on the current stream so its memory is
            # not handed out again while the consumer is still reading it.
            input.record_stream(torch.cuda.current_stream())
        self.preload()
        return input

In [15]:
# Constructing the prefetcher already fetches the first batch (its __init__
# calls preload()), so that work happens here, outside the timed cell below.
prefetcher = data_prefetcher(dataloader)

In [16]:
%%time
batch = prefetcher.next()

while batch is not None:
    #batch = batch.cuda()
    with torch.no_grad():
        y = net(batch)
    batch = prefetcher.next()

Wall time: 24.2 s


In [17]:
%%time

prefetcher = data_prefetcher(dataloader)
batch = prefetcher.next()

while batch is not None:
    #batch = batch.cuda()
    with torch.no_grad():
        y = net(batch)
    batch = prefetcher.next()

Wall time: 24.5 s


嗯……使用预取器之后并没有快多少。

In [18]:
%%time

prefetcher = data_prefetcher(dataloader)
batch = prefetcher.next()
while batch is not None:
    #batch = batch.cuda()
    #with torch.no_grad():
    #    y = net(batch)
    batch = prefetcher.next()

Wall time: 9.36 s


In [19]:
%%time

prefetcher = data_prefetcher(dataloader)
batch = prefetcher.next()
while batch is not None:
    #batch = batch.cuda()
    #with torch.no_grad():
    #    y = net(batch)
    batch = prefetcher.next()

Wall time: 9.56 s


In [20]:
%%time
# Data loading plus a plain .cuda() copy only, without the prefetcher; the
# forward pass (`with torch.no_grad(): y = net(batch)`) is deliberately left
# out so this can be compared with the prefetcher-only timing.
for batch in dataloader:
    batch = batch.cuda()

Wall time: 9.47 s


In [1]:
import torch

In [2]:
# Sanity check: can PyTorch use CUDA from this kernel?
torch.cuda.is_available()

True

In [3]:
# Number of CUDA devices visible to PyTorch.
torch.cuda.device_count()

1

In [4]:
# From here on, tensors created without an explicit type are CUDA float32,
# so the operands of the micro-benchmark below are allocated on the GPU.
torch.set_default_tensor_type('torch.cuda.FloatTensor')

In [25]:
# First operand: 2e8 random values (about 800 MB as float32, assuming the
# default tensor type set earlier is still in effect).
A = torch.randn(int(2e8))

In [26]:
# Second operand, same size as A.
B = torch.randn(int(2e8))

In [13]:
# A single untimed element-wise product -- presumably a warm-up run before
# the timed loop; confirm intent.
with torch.no_grad():
    C = A * B

In [30]:
%%time
for i in range(2000):
    with torch.no_grad():
        C = A * B

Wall time: 4.74 s


In [31]:
# Bring both operands back to host memory for the CPU comparison.
A, B = A.cpu(), B.cpu()

In [32]:
%%time
for i in range(2000):
    with torch.no_grad():
        C = A * B

Wall time: 11min 6s
