In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

import copy
import numpy as np
import pandas as pd
import timeit
from IPython.display import display

from sklearn.preprocessing import MultiLabelBinarizer
from PIL import Image

from layers import Flatten

In [None]:
class ChunkSampler(sampler.Sampler):
    """Sampler that yields one contiguous run of indices.

    Arguments:
        num_samples: how many consecutive indices to produce
        start: first index of the run (defaults to 0)
    """
    def __init__(self, num_samples, start = 0):
        self.start = start
        self.num_samples = num_samples

    def __len__(self):
        return self.num_samples

    def __iter__(self):
        first = self.start
        stop = first + self.num_samples
        return iter(range(first, stop))

In [None]:
# Load data
# Image folders; train_path is used as a string prefix when images are opened,
# so it must keep its trailing slash.
train_path = './input/train-jpg/'
test_path = './input/test-jpg/'
# `train` supplies the 'image_name' and 'tags' columns used by the cells below.
train = pd.read_csv('./input/train_v2.csv')
# The sample-submission CSV stands in for the test-set listing; in this notebook
# only its shape and first row are inspected.
test = pd.read_csv('./input/sample_submission_v2.csv')

In [None]:
# Show both table shapes, then derive the split sizes: a fixed training count,
# with every remaining training row going to validation.
print(train.shape)
print(test.shape)
NUM_TRAIN = 32000
NUM_VAL = len(train) - NUM_TRAIN
NUM_TEST = len(test)

In [None]:
# Peek at the first row of each table to see which columns they carry.
print(train.iloc[0])
print(test.iloc[0])

In [None]:
# Rich-display the first ten training rows.
display(train.head(10))

In [None]:
# Each 'tags' entry is one whitespace-separated string; split() turns it into a list of labels.
train['tags'][0].split()

TBD
Make loop to import all images & store as a numpy array of (3,32,32)'s
Save extracted image data somewhere so I don't need to preprocess each time
Convert text labels into multi-hot vectors, with the vocab being the 17 labels in alphabetical order, indexed from 0 (0 = agriculture, and so on).

In [None]:
# Gather every tag occurrence across all rows, then give each distinct tag an
# index following alphabetical order (starting at 0).
vocab = [tag for tags in train['tags'] for tag in tags.split()]
vocab_ordered = sorted(set(vocab))
vocab_dict = dict(zip(vocab_ordered, range(len(vocab_ordered))))
vocab_dict

In [None]:
# For every image, the list of vocab_dict indices of its tags, in the order the tags appear in the row.
labels_inds = [[vocab_dict[word] for word in row.split()] for row in train['tags']] 

In [None]:
# Multi-hot encode the tags: each image's tag set becomes one row of `labels`.
mlb = MultiLabelBinarizer()
labels_words = [set(row.split()) for row in train['tags']]
labels = mlb.fit_transform(labels_words)

In [None]:
# Load the first 1000 training images into one (N, C, H, W) array.
train_dataset = np.zeros((1000,3,256,256))
for i,image_name in enumerate(train['image_name'][:1000]):
    # The context manager closes the file handle as soon as the pixels are copied out.
    with Image.open(train_path + image_name + '.jpg') as jpg:
        im = np.array(jpg)[:,:,:3]  # keep only the first three channels
    # The decoded array is (H, W, C). Moving the channel axis to the front needs a
    # transpose; a reshape keeps the raw memory order and would interleave the
    # colour channels across the image instead.
    im = np.transpose(im,(2,0,1))
    train_dataset[i,:,:,:] = im

In [None]:
# Sanity check: should match the (1000, 3, 256, 256) array allocated for the images.
train_dataset.shape

In [None]:
# Pair the image array with the first 1000 label rows as tensors, show the first
# (image, label) sample, and batch the pairs in shuffled groups of 100.
train_data = torch.from_numpy(train_dataset)
our_labels = torch.from_numpy(labels[:1000])
train_tensor_dataset = torch.utils.data.TensorDataset(train_data, our_labels)
print(train_tensor_dataset[0])
loader_train = DataLoader(train_tensor_dataset, shuffle=True, batch_size=100)

In [2]:
# Everything in this cell lives on the GPU, so it requires a CUDA device.
gpu_dtype = torch.cuda.FloatTensor

# Minimal conv net: conv -> ReLU -> batch norm -> 2x2 max pool -> flatten -> 17 scores.
simple_model = nn.Sequential(
    nn.Conv2d(3, 3, kernel_size=3, stride=1),
    nn.ReLU(inplace=True),
    nn.BatchNorm2d(3),
    nn.MaxPool2d(kernel_size=2, stride=2),
    Flatten(),
    # 48387 = 3 * 127 * 127: a 256x256 input shrinks to 254 after the 3x3 conv
    # and to 127 after the pool.
    nn.Linear(48387, 17),
)
simple_model.type(gpu_dtype)

# Sanity check: push a random batch through an eval-mode copy of the model and
# print the shape of the resulting scores.
model_gpu = copy.deepcopy(simple_model).type(gpu_dtype)
model_gpu.eval()
x = torch.randn(10, 3, 256, 256).type(gpu_dtype)
x_var = Variable(x)           # wrap the input batch for autograd
scores = model_gpu(x_var)
print(scores.size())

loss_fn = nn.MultiLabelSoftMarginLoss().type(gpu_dtype)
optimizer = optim.RMSprop(
    simple_model.parameters(),
    lr=1e-3,
    alpha=0.99,
    eps=1e-08,
    weight_decay=0,
    momentum=0,
    centered=False,
)

torch.Size([10, 17])


In [None]:
simple_model.train()

print_every = 1

# One pass over loader_train, taking one optimizer step per mini-batch.
for t, (x, y) in enumerate(loader_train):
    # Cast the batch to the GPU float type and wrap it for autograd.
    x_var = Variable(x.type(gpu_dtype))
    y_var = Variable(y.type(gpu_dtype))

    # Forward pass, then the multi-label loss against the true tags.
    scores = simple_model(x_var)
    loss = loss_fn(scores, y_var)

    if (t + 1) % print_every == 0:
        print('t = %d, loss = %.4f' % (t + 1, loss.data[0]))

    # Clear stale gradients, backpropagate, then apply the parameter update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Now, I'll try making a dataset subclass which does batch pulls of the data into RAM

Idea: change ONLY the Dataset subclass; the DataLoader itself should not need to be touched at all.

In [None]:
# Image folders (used as string prefixes, so the trailing slash matters) and label files.
train_data_path = './input/train-jpg/'
test_data_path = './input/test-jpg/'

train_labels_path = './input/train_v2.csv'
test_labels_path = None #it's not provided

#pd.read_csv(train_labels_path)

# Split sizes: fixed training count, the remaining training rows for validation.
NUM_TRAIN = 32000
NUM_VAL = len(train) - NUM_TRAIN
NUM_TEST = len(test)

from torch.utils.data.dataset import Dataset
from torch.utils.data.sampler import SequentialSampler
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
from PIL import Image


#this is the naive implementation which pulls from file every time you get an item. no caching
class NaiveDataset(Dataset):
    """Dataset that opens and decodes one JPEG from disk on every ``__getitem__`` call.

    The multi-hot label matrix is built once, at construction time, from the
    'tags' column of the labels CSV. Nothing about the images is cached.

    Arguments:
        data_path (str): path to image folder; it is used as a plain string
            prefix, so it must end with a path separator
        labels_path (str): path to csv containing labels per image
        num_examples (int): number of examples exposed through ``len()``;
            must not exceed the number of rows in the csv
    """
    def load_image(self, idx):
        """Return image ``idx`` as a (C, H, W) tensor holding its first three channels."""
        image_name = self.labels_df['image_name'][idx]
        # The context manager closes the file handle once the pixels are copied out.
        with Image.open(self.data_path + image_name + '.jpg') as jpg:
            im = np.array(jpg)[:,:,:3]
        # The decoded array is (H, W, C). Moving channels first needs a transpose;
        # a reshape keeps the raw memory order and would interleave the colour
        # channels. The contiguous copy also gives from_numpy a writable array.
        im = np.ascontiguousarray(np.transpose(im,(2,0,1)))
        return torch.from_numpy(im)

    def __init__(self, data_path=train_data_path, labels_path=train_labels_path,
                 num_examples=1000):
        self.labels_df = pd.read_csv(labels_path)
        assert num_examples <= self.labels_df.shape[0]
        self.num_examples = num_examples

        # One row per csv row, one column per distinct tag.
        mlb = MultiLabelBinarizer()
        labels_words = [set(row.split()) for row in self.labels_df['tags']]
        self.labels_tensor = torch.from_numpy(mlb.fit_transform(labels_words))

        self.data_path = data_path

    def __getitem__(self, idx):
        """Return ``(image_tensor, label_row)`` for example ``idx``.

        Raises:
            IndexError: if ``idx`` is outside ``[0, num_examples)``. This also
                lets plain ``for sample in dataset`` iteration terminate.
        """
        if not 0 <= idx < self.num_examples:
            raise IndexError('index %d out of range for %d examples' % (idx, self.num_examples))
        data_tensor = self.load_image(idx)
        target_tensor = self.labels_tensor[idx]
        return data_tensor,target_tensor

    def __len__(self):
        return self.num_examples

class DynamicDataset(Dataset):
    """Dataset that keeps one ``buffer_size``-image chunk decoded in host RAM at a time.

    Positions ``0 .. num_examples-1`` are split into consecutive chunks of
    ``buffer_size``. The chunk containing the requested position is decoded
    from disk the first time it is needed and replaces the previous chunk, so
    sequential access decodes every image exactly once per pass. Random access
    across chunks works but reloads a whole chunk on every chunk change.

    Position ``p`` refers to csv row ``inds_array[p]`` for both the image and
    its label, so a non-None ``rand_seed`` yields a fixed shuffled order.

    Precondition - num_examples must be a multiple of buffer_size (relax this later)

    Arguments:
        data_path (str): path to image folder; used as a plain string prefix,
            so it must end with a path separator
        labels_path (str): path to csv containing labels per image
        num_examples (int): number of examples
        buffer_size (int): size of precaching buffer
        rand_seed (None/int): if None, go sequentially. If <=0, permute using the
            current global NumPy RNG state. If >0, seed the global NumPy and
            torch RNGs with this value first, then permute.
    """

    # Per-image layout stored in the buffer: channels, height, width.
    IMAGE_SHAPE = (3, 256, 256)

    def __init__(self, data_path=train_data_path, labels_path=train_labels_path,
                 num_examples=1000, buffer_size=1000, rand_seed = None):
        self.labels_df = pd.read_csv(labels_path)
        assert buffer_size > 0, 'buffer_size must be positive'
        assert num_examples <= self.labels_df.shape[0]
        assert num_examples >= buffer_size
        assert num_examples % buffer_size == 0

        # One row per csv row, one column per distinct tag.
        mlb = MultiLabelBinarizer()
        labels_words = [set(row.split()) for row in self.labels_df['tags']]
        self.labels_tensor = torch.from_numpy(mlb.fit_transform(labels_words))

        self.num_examples = num_examples
        self.buffer_size = buffer_size
        self.rand_seed = rand_seed
        self.data_path = data_path

        if rand_seed is None:
            self.inds_array = np.arange(num_examples)
        else:
            if rand_seed > 0:
                np.random.seed(rand_seed)
                torch.manual_seed(rand_seed)
            self.inds_array = np.random.permutation(num_examples)

        # backup_buffer only holds a chunk while it is being staged; it is None otherwise.
        self.backup_buffer = None
        self.buffer_index = 0
        self.data_tensor = self._read_chunk(0)

    def load_image(self, idx):
        """Return the image of csv row ``idx`` as a (C, H, W) NumPy array (first three channels)."""
        image_name = self.labels_df['image_name'].iloc[idx]
        # The context manager closes the file handle once the pixels are copied out.
        with Image.open(self.data_path + image_name + '.jpg') as jpg:
            im = np.array(jpg)[:,:,:3]
        # The decoded array is (H, W, C). Moving channels first needs a transpose;
        # a reshape keeps the raw memory order and would interleave the colour channels.
        return np.transpose(im,(2,0,1))

    def _read_chunk(self, chunk_index):
        """Decode every image of chunk ``chunk_index`` into a fresh float64 tensor."""
        # A new array per chunk (rather than overwriting in place) keeps samples
        # already handed out from the previous chunk valid.
        chunk = np.zeros((self.buffer_size,) + self.IMAGE_SHAPE)
        first = chunk_index * self.buffer_size
        for slot, row in enumerate(self.inds_array[first:first + self.buffer_size]):
            chunk[slot] = self.load_image(row)
        return torch.from_numpy(chunk)

    def _switch_to_chunk(self, chunk_index):
        """Stage chunk ``chunk_index`` in ``backup_buffer``, then make it the active buffer."""
        self.backup_buffer = self._read_chunk(chunk_index)
        # Both buffers are host-memory tensors; dropping the old reference is all
        # that is needed for it to be reclaimed.
        self.data_tensor, self.backup_buffer = self.backup_buffer, None
        self.buffer_index = chunk_index

    def fill_buffer(self):
        """Advance to the chunk after the current one, wrapping back to the first at the end."""
        num_chunks = self.num_examples // self.buffer_size
        self._switch_to_chunk((self.buffer_index + 1) % num_chunks)

    def __getitem__(self, index):
        """Return ``(image, label_row)`` for position ``index``.

        Raises:
            IndexError: if ``index`` is outside ``[0, num_examples)``.
        """
        if not 0 <= index < self.num_examples:
            raise IndexError('index %d out of range for %d examples' % (index, self.num_examples))
        chunk_index, slot = divmod(index, self.buffer_size)
        if chunk_index != self.buffer_index:
            self._switch_to_chunk(chunk_index)
        # Image and label must come from the same csv row.
        return self.data_tensor[slot], self.labels_tensor[int(self.inds_array[index])]

    def __len__(self):
        # The dataset spans all requested examples, not just the chunk in memory.
        return self.num_examples

#Since dataloaders are created in conjunction with samplers, and because of our RAM constraint when loading data,
#we needed to create this helper function to produce a dataloader object with the appropriate sampler. Without this helper
#function, there is a risk that the Sampler would not be able to sample random pictures properly
def createFastLoaderWithSampler(data_path=train_data_path, labels_path=train_labels_path,
                 num_examples=1000, buffer_size=1000, rand_seed = None, batch_size=100):
    """Build a DynamicDataset and return a DataLoader that walks it sequentially.

    Arguments:
        data_path (str): forwarded to DynamicDataset
        labels_path (str): forwarded to DynamicDataset
        num_examples (int): forwarded to DynamicDataset
        buffer_size (int): forwarded to DynamicDataset
        rand_seed (None/int): forwarded to DynamicDataset
        batch_size (int): number of samples per batch produced by the loader

    Returns:
        torch.utils.data.DataLoader over the new dataset, using a SequentialSampler.
    """
    dynamic_dataset = DynamicDataset(data_path, labels_path,
                 num_examples, buffer_size, rand_seed)
    # DataLoader rejects shuffle=True together with an explicit sampler, so the
    # loader always iterates in index order whatever rand_seed is.
    return torch.utils.data.DataLoader(dynamic_dataset, batch_size=batch_size,
                                       sampler=SequentialSampler(dynamic_dataset))
    

In [None]:
# One instance of each dataset flavour, built with the constructors' default arguments.
train_naive_dataset = NaiveDataset()
train_dynamic_loading_dataset = DynamicDataset()
# Loader built with num_examples=NUM_TRAIN and a 4000-image buffer
# (DynamicDataset asserts that num_examples is a multiple of buffer_size).
loader_fast_train = createFastLoaderWithSampler(num_examples=NUM_TRAIN,buffer_size=4000)

In [None]:
%%timeit
simple_model.train()

#loader_dynamic_train = torch.utils.data.DataLoader(train_dynamic_loading_dataset, batch_size=100, shuffle=True)

print_every = 1

# Load one batch at a time.
for t, (x, y) in enumerate(loader_fast_train):
    x_var = Variable(x.type(gpu_dtype))
    y_var = Variable(y.type(gpu_dtype))

    # This is the forward pass: predict the scores for each class, for each x in the batch.
    scores = simple_model(x_var)
    
    # Use the correct y values and the predicted y values to compute the loss.
    loss = loss_fn(scores, y_var)
    
    if (t + 1) % print_every == 0:
        print('t = %d, loss = %.4f' % (t + 1, loss.data[0]))

    # Zero out all of the gradients for the variables which the optimizer will update.
    optimizer.zero_grad()
    
    # This is the backwards pass: compute the gradient of the loss with respect to each 
    # parameter of the model.
    loss.backward()
    
    # Actually update the parameters of the model using the gradients computed by the backwards pass.
    optimizer.step()

In [3]:
from data_utils import createFastLoaderWithSampler

In [7]:
NUM_TRAIN = 40000
BUFFER_SIZE = 4000

loader_fast_train = createFastLoaderWithSampler(buffer_size=BUFFER_SIZE, num_examples=NUM_TRAIN)

print_every = 1

# One pass over loader_fast_train, taking one optimizer step per mini-batch.
for t, (x, y) in enumerate(loader_fast_train):
    # Cast the batch to the GPU float type and wrap it for autograd.
    x_var = Variable(x.type(gpu_dtype))
    y_var = Variable(y.type(gpu_dtype))

    # Forward pass, then the multi-label loss against the true tags.
    scores = simple_model(x_var)
    loss = loss_fn(scores, y_var)

    if (t + 1) % print_every == 0:
        print('t = %d, loss = %.4f' % (t + 1, loss.data[0]))

    # Clear stale gradients, backpropagate, then apply the parameter update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

t = 1, loss = 0.2699
t = 2, loss = 0.2656
t = 3, loss = 0.2769
t = 4, loss = 0.2524
t = 5, loss = 0.2582
t = 6, loss = 0.2780
t = 7, loss = 0.2382
t = 8, loss = 0.2767
t = 9, loss = 0.2550
t = 10, loss = 0.2497
t = 11, loss = 0.2414
t = 12, loss = 0.2589
t = 13, loss = 0.2431
t = 14, loss = 0.2660
t = 15, loss = 0.2521
t = 16, loss = 0.2740
t = 17, loss = 0.2566
t = 18, loss = 0.2724
t = 19, loss = 0.2599
t = 20, loss = 0.2503
t = 21, loss = 0.2935
t = 22, loss = 0.2610
t = 23, loss = 0.2608
t = 24, loss = 0.2747
t = 25, loss = 0.2630
t = 26, loss = 0.2357
t = 27, loss = 0.2982
t = 28, loss = 0.2687
t = 29, loss = 0.2615
t = 30, loss = 0.2539
t = 31, loss = 0.2636
t = 32, loss = 0.2699
t = 33, loss = 0.2552
t = 34, loss = 0.2583
t = 35, loss = 0.2610
t = 36, loss = 0.2748
t = 37, loss = 0.2641
t = 38, loss = 0.2572
t = 39, loss = 0.2420
t = 40, loss = 0.2756
