In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

import copy
import numpy as np
import pandas as pd
import timeit
from IPython.display import display

from sklearn.preprocessing import MultiLabelBinarizer
from PIL import Image

from layers import Flatten

In [2]:
class ChunkSampler(sampler.Sampler):
    """Samples elements sequentially from some offset. 
    Arguments:
        num_samples: # of desired datapoints
        start: offset where we should start selecting from
    """
    def __init__(self, num_samples, start = 0):
        self.num_samples = num_samples
        self.start = start

    def __iter__(self):
        return iter(range(self.start, self.start + self.num_samples))

    def __len__(self):
        return self.num_samples

In [3]:
# Load data
train_path = './input/train-jpg/'
test_path = './input/test-jpg/'
train = pd.read_csv('./input/train_v2.csv')
test = pd.read_csv('./input/sample_submission_v2.csv')

In [4]:
print(train.shape)
print(test.shape)
NUM_TRAIN = 32000
NUM_VAL = train.shape[0]-NUM_TRAIN
NUM_TEST = test.shape[0]

(40479, 2)
(61191, 2)


In [5]:
print(train.iloc[0])
print(test.iloc[0])

image_name         train_0
tags          haze primary
Name: 0, dtype: object
image_name                                  test_0
tags          primary clear agriculture road water
Name: 0, dtype: object


In [6]:
display(train[0:10])

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road
5,train_5,haze primary water
6,train_6,agriculture clear cultivation primary water
7,train_7,haze primary
8,train_8,agriculture clear cultivation primary
9,train_9,agriculture clear cultivation primary road


In [7]:
train['tags'][0].split()

['haze', 'primary']

TBD
Make loop to import all images & store as a numpy array of (3,32,32)'s
Save extracted image data somewhere so I don't need to preprocess each time
Convert text labels into multi-hot vectors, with vocab as the 17 labels in alphabetical order. 1 = agriculture, 2 = clear, etc.

In [8]:
vocab = [item for i in range(train.shape[0]) for item in train['tags'][i].split()]
vocab_ordered = sorted(set(vocab))
vocab_dict = {word: index for index, word in enumerate(vocab_ordered)}
vocab_dict

{'agriculture': 0,
 'artisinal_mine': 1,
 'bare_ground': 2,
 'blooming': 3,
 'blow_down': 4,
 'clear': 5,
 'cloudy': 6,
 'conventional_mine': 7,
 'cultivation': 8,
 'habitation': 9,
 'haze': 10,
 'partly_cloudy': 11,
 'primary': 12,
 'road': 13,
 'selective_logging': 14,
 'slash_burn': 15,
 'water': 16}

In [9]:
labels_inds = [[vocab_dict[word] for word in row.split()] for row in train['tags']] 

In [10]:
mlb = MultiLabelBinarizer()
labels_words = [set([word for word in row.split()]) for row in train['tags']]
labels = mlb.fit_transform(labels_words)

In [11]:
train_dataset = np.zeros((1000,3,256,256))
for i,image_name in enumerate(train['image_name'][:1000]):
    im = Image.open(train_path + image_name + '.jpg')
    im = np.array(im)[:,:,:3]
    im = np.reshape(im,(im.shape[2],im.shape[0],im.shape[1]))
    train_dataset[i,:,:,:] = im

In [12]:
train_dataset.shape

(1000, 3, 256, 256)

In [13]:
train_data = torch.from_numpy(train_dataset)
our_labels = torch.from_numpy(labels[:1000])
train_tensor_dataset = torch.utils.data.TensorDataset(train_data, our_labels)
print(train_tensor_dataset[0])
loader_train = torch.utils.data.DataLoader(train_tensor_dataset, batch_size=100, shuffle=True)

(
( 0 ,.,.) = 
  158  143  150  ...   146  153  163
  146  153  160  ...   151  164  148
  151  165  148  ...   160  146  152
      ...         ⋱        ...      
  147  156  166  ...   149  160  146
  152  161  145  ...   160  141  150
  166  147  153  ...   149  156  171

( 1 ,.,.) = 
  148  157  168  ...   147  159  145
  150  160  144  ...   161  141  149
  166  147  153  ...   149  154  169
      ...         ⋱        ...      
  148  159  142  ...   167  143  152
  160  144  158  ...   146  150  162
  146  151  160  ...   145  157  139

( 2 ,.,.) = 
  146  158  140  ...   166  146  151
  159  144  158  ...   146  150  164
  146  151  164  ...   143  156  137
      ...         ⋱        ...      
  157  147  149  ...   140  147  153
  140  145  153  ...   156  162  151
  157  162  149  ...   172  153  157
[torch.DoubleTensor of size 3x256x256]
, 
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 1
 0
 1
 0
 0
 0
 0
[torch.LongTensor of size 17]
)


In [14]:
simple_model = nn.Sequential(
                nn.Conv2d(3, 3, kernel_size=3, stride=1),
                nn.ReLU(inplace=True),
                nn.BatchNorm2d(3),
                nn.MaxPool2d(kernel_size=2,stride=2),
                Flatten(),
                nn.Linear(48387,17)
              )
gpu_dtype = torch.cuda.FloatTensor
simple_model.type(gpu_dtype)

#check output dimensions before flattening
model_gpu = copy.deepcopy(simple_model).type(gpu_dtype)
model_gpu.eval()
x = torch.randn(10, 3, 256, 256).type(gpu_dtype)
x_var = Variable(x.type(gpu_dtype)) # Construct a PyTorch Variable out of your input data
scores = model_gpu(x_var)        # Feed it through the model! 
print(scores.size())

loss_fn = nn.MultiLabelSoftMarginLoss().type(gpu_dtype)
optimizer = optim.RMSprop(simple_model.parameters(), lr=1e-3, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)

torch.Size([10, 17])


In [15]:
simple_model.train()

print_every = 1

# Load one batch at a time.
for t, (x, y) in enumerate(loader_train):
    x_var = Variable(x.type(gpu_dtype))
    y_var = Variable(y.type(gpu_dtype))

    # This is the forward pass: predict the scores for each class, for each x in the batch.
    scores = simple_model(x_var)
    
    # Use the correct y values and the predicted y values to compute the loss.
    loss = loss_fn(scores, y_var)
    
    if (t + 1) % print_every == 0:
        print('t = %d, loss = %.4f' % (t + 1, loss.data[0]))

    # Zero out all of the gradients for the variables which the optimizer will update.
    optimizer.zero_grad()
    
    # This is the backwards pass: compute the gradient of the loss with respect to each 
    # parameter of the model.
    loss.backward()
    
    # Actually update the parameters of the model using the gradients computed by the backwards pass.
    optimizer.step()

t = 1, loss = 0.7058
t = 2, loss = 6.9440
t = 3, loss = 3.6909
t = 4, loss = 12.1577
t = 5, loss = 5.1227
t = 6, loss = 5.1815
t = 7, loss = 4.4140
t = 8, loss = 2.6482
t = 9, loss = 2.6891
t = 10, loss = 2.9625


# Now, I'll try making a dataset subclass which does batch pulls of the data into RAM

Idea - ONLY change Dataset subclass. Don't want to touch dataloader at all.

In [16]:
train_data_path = './input/train-jpg/'
test_data_path = './input/test-jpg/'

train_labels_path ='./input/train_v2.csv' 
test_labels_path = None #it's not provided

#pd.read_csv(train_labels_path)

NUM_TRAIN = 32000
NUM_VAL = train.shape[0]-NUM_TRAIN
NUM_TEST = test.shape[0]

from torch.utils.data.dataset import Dataset
from torch.utils.data.sampler import SequentialSampler
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
from PIL import Image


#this is the naive implementation which pulls from file every time you get an item. no caching
class NaiveDataset(Dataset):
    """Dataset wrapping data and target tensors. Naive implementation does data preprocessing per 'get_item' call

    Each sample will be retrieved by indexing both tensors along the first
    dimension.
    
    Arguments:
        data_path (str): path to image folder
        labels_path (str): path to csv containing labels per image
        num_examples (int): number of examples
    """
    def load_image(self, idx):
        image_name = self.labels_df['image_name'][idx]
        im = Image.open(self.data_path + image_name + '.jpg')
        im = np.array(im)[:,:,:3]
        im = np.reshape(im,(im.shape[2],im.shape[0],im.shape[1]))
        return torch.from_numpy(im)
    
    def __init__(self, data_path=train_data_path, labels_path=train_labels_path,
                 num_examples=1000):
        self.labels_df = pd.read_csv(labels_path)
        assert num_examples <= self.labels_df.shape[0]
        self.num_examples = num_examples
        
        mlb = MultiLabelBinarizer()
        labels_words = [set([word for word in row.split()]) for row in self.labels_df['tags']]
        self.labels_tensor = torch.from_numpy(mlb.fit_transform(labels_words))
        
        self.data_path = data_path

    def __getitem__(self, idx):
        data_tensor = self.load_image(idx)
        target_tensor = self.labels_tensor[idx]
        return data_tensor,target_tensor

    def __len__(self):
        return self.num_examples

class DynamicDataset(Dataset):
    """Dataset wrapping data and target tensors with dynamic loading and buffering

    Each sample will be retrieved by indexing both tensors along the first
    dimension.
    
    Precondition - buffer_size must be a multiple of num_examples (relax this later)

    Arguments:
        data_path (str): path to image folder
        labels_path (str): path to csv containing labels per image
        num_examples (int): number of examples
        buffer_size (int): size of precaching buffer
        rand_seed (None/int): if None, go sequentially. If <0, use system clock for seed. If >0, use seed value
    """

    def __init__(self, data_path=train_data_path, labels_path=train_labels_path,
                 num_examples=1000, buffer_size=1000, rand_seed = None):
        self.labels_df = pd.read_csv(labels_path)
        assert num_examples <= self.labels_df.shape[0]
        assert num_examples >= buffer_size
        assert num_examples % buffer_size == 0
        
        mlb = MultiLabelBinarizer()
        labels_words = [set([word for word in row.split()]) for row in self.labels_df['tags']]
        self.labels_tensor = torch.from_numpy(mlb.fit_transform(labels_words))
        
        self.num_examples = num_examples
        self.buffer_size = buffer_size
        self.rand_seed = 0
        self.buffer_index = 0
        
        if rand_seed is None:
            self.inds_array = np.arange(num_examples)
        elif rand_seed<=0:
            self.inds_array = np.random.permutation(num_examples)
        elif rand_seed>0:
            np.random.seed(rand_seed)
            torch.manual_seed(rand_seed)
            self.inds_array = np.random.permutation(num_examples)

        self.data_path = data_path
        
        self.data_tensor = np.zeros((buffer_size,3,256,256))
        self.backup_buffer = np.zeros(self.data_tensor.shape)
        
        for i in range(self.buffer_index*buffer_size,self.buffer_index*buffer_size+1):
            self.data_tensor[i,:,:,:] = self.load_image(i)
        self.data_tensor = torch.from_numpy(self.data_tensor)
        
    def load_image(self, idx):
        image_name = self.labels_df['image_name'][idx]
        im = Image.open(self.data_path + image_name + '.jpg')
        im = np.array(im)[:,:,:3]
        im = np.reshape(im,(im.shape[2],im.shape[0],im.shape[1]))
        return im  
    
    def fill_buffer(self):
        self.backup_buffer = np.zeros((self.buffer_size,3,256,256)) #does this clear the GPU RAM properly? Monitor memory..
        self.buffer_index += 1
        for i in range(int(self.buffer_size)):
            self.backup_buffer[i,:,:,:] = self.load_image(i+self.buffer_index*self.buffer_size)
        self.backup_buffer = torch.from_numpy(self.backup_buffer)

    def __getitem__(self, index):
        if index>self.buffer_index*self.buffer_size/2:
            self.fill_buffer()
        elif index>=self.buffer_index*self.buffer_size:
            self.data_tensor = self.backup_buffer #does this do assignment properly w/o causing a GPU/CPU RAM memory leak?
        return self.data_tensor[index%self.buffer_size], self.labels_tensor[index%self.buffer_size]

    def __len__(self):
        return self.data_tensor.size(0)

#Since dataloaders are created in conjunction with samplers, and because of our RAM constraint when loading data,
#We needed to create this helper function to produce a dataloader object with the appropraite sampler. Without this helper
#function, there is a risk that the Sampler would not be able to sample random pictures properly
def createFastLoaderWithSampler(data_path=train_data_path, labels_path=train_labels_path,
                 num_examples=1000, buffer_size=1000, rand_seed = None, batch_size=100):
    dynamic_dataset = DynamicDataset(data_path, labels_path,
                 num_examples, buffer_size, rand_seed)
    return torch.utils.data.DataLoader(dynamic_dataset, batch_size=100, 
                                       shuffle=(rand_seed is not None), sampler=SequentialSampler(dynamic_dataset))
    

In [26]:
train_naive_dataset = NaiveDataset(num_examples=2000)
train_dynamic_loading_dataset = DynamicDataset()
loader_naive_train = DataLoader(train_naive_dataset, batch_size=100,num_workers=8)

In [29]:
%%timeit
simple_model.train()

train_naive_dataset = NaiveDataset(num_examples=2000)
loader_naive_train = torch.utils.data.DataLoader(train_naive_dataset, batch_size=100, shuffle=True,num_workers=8)

print_every = 1

# Load one batch at a time.
for t, (x, y) in enumerate(loader_naive_train):
    x_var = Variable(x.type(gpu_dtype))
    y_var = Variable(y.type(gpu_dtype))

    # This is the forward pass: predict the scores for each class, for each x in the batch.
    scores = simple_model(x_var)
    
    # Use the correct y values and the predicted y values to compute the loss.
    loss = loss_fn(scores, y_var)
    
    if (t + 1) % print_every == 0:
        print('t = %d, loss = %.4f' % (t + 1, loss.data[0]))

    # Zero out all of the gradients for the variables which the optimizer will update.
    optimizer.zero_grad()
    
    # This is the backwards pass: compute the gradient of the loss with respect to each 
    # parameter of the model.
    loss.backward()
    
    # Actually update the parameters of the model using the gradients computed by the backwards pass.
    optimizer.step()

t = 1, loss = 0.2865
t = 2, loss = 0.3354
t = 3, loss = 0.2689
t = 4, loss = 0.2511
t = 5, loss = 0.2372
t = 6, loss = 0.2993
t = 7, loss = 0.3771
t = 8, loss = 0.2668
t = 9, loss = 0.3461
t = 10, loss = 0.2548
t = 11, loss = 0.2781
t = 12, loss = 0.2333
t = 13, loss = 0.3106
t = 14, loss = 0.2612
t = 15, loss = 0.2355
t = 16, loss = 0.3117
t = 17, loss = 0.2764
t = 18, loss = 0.3994
t = 19, loss = 0.3268
t = 20, loss = 0.3256
t = 1, loss = 0.2613
t = 2, loss = 0.2629
t = 3, loss = 0.2951
t = 4, loss = 0.2804
t = 5, loss = 0.2417
t = 6, loss = 0.3429
t = 7, loss = 0.2466
t = 8, loss = 0.2273
t = 9, loss = 0.2541
t = 10, loss = 0.2712
t = 11, loss = 0.3139
t = 12, loss = 0.2494
t = 13, loss = 0.4289
t = 14, loss = 0.3762
t = 15, loss = 0.2689
t = 16, loss = 0.2755
t = 17, loss = 0.2193
t = 18, loss = 0.4387
t = 19, loss = 0.2702
t = 20, loss = 0.3132
t = 1, loss = 0.2780
t = 2, loss = 0.3132
t = 3, loss = 0.2473
t = 4, loss = 0.3316
t = 5, loss = 0.2630
t = 6, loss = 0.2445
t = 7, loss 

In [24]:
from data_utils import createFastLoaderWithSampler

In [31]:
%%timeit
NUM_TRAIN = 2000
BUFFER_SIZE = 2000

loader_fast_train = createFastLoaderWithSampler(num_examples=NUM_TRAIN,buffer_size=BUFFER_SIZE,num_workers=8)
#print(len(loader_fast_train))

print_every = 1

# Load one batch at a time.
for t, (x, y) in enumerate(loader_fast_train):
    x_var = Variable(x.type(gpu_dtype))
    y_var = Variable(y.type(gpu_dtype))

    # This is the forward pass: predict the scores for each class, for each x in the batch.
    scores = simple_model(x_var)
    
    # Use the correct y values and the predicted y values to compute the loss.
    loss = loss_fn(scores, y_var)
    
    if (t + 1) % print_every == 0:
        print('t = %d, loss = %.4f' % (t + 1, loss.data[0]))

    # Zero out all of the gradients for the variables which the optimizer will update.
    optimizer.zero_grad()
    
    # This is the backwards pass: compute the gradient of the loss with respect to each 
    # parameter of the model.
    loss.backward()
    
    # Actually update the parameters of the model using the gradients computed by the backwards pass.
    optimizer.step()

2000
t = 1, loss = 0.2680
t = 2, loss = 0.2649
t = 3, loss = 0.3026
t = 4, loss = 0.3373
t = 5, loss = 0.2515
t = 6, loss = 0.3438
t = 7, loss = 0.2337
t = 8, loss = 0.3320
t = 9, loss = 0.2483
t = 10, loss = 0.3133
t = 11, loss = 0.2506
t = 12, loss = 0.2638
t = 13, loss = 0.2906
t = 14, loss = 0.2933
t = 15, loss = 0.2777
t = 16, loss = 0.3015
t = 17, loss = 0.3176
t = 18, loss = 0.3330
t = 19, loss = 0.2560
t = 20, loss = 0.3259
2000
t = 1, loss = 0.2681
t = 2, loss = 0.2656
t = 3, loss = 0.3030
t = 4, loss = 0.3245
t = 5, loss = 0.2514
t = 6, loss = 0.3341
t = 7, loss = 0.2306
t = 8, loss = 0.3288
t = 9, loss = 0.2482
t = 10, loss = 0.2482
t = 11, loss = 0.2395
t = 12, loss = 0.2587
t = 13, loss = 0.2413
t = 14, loss = 0.3080
t = 15, loss = 0.2679
t = 16, loss = 0.2779
t = 17, loss = 0.3173
t = 18, loss = 0.3188
t = 19, loss = 0.2754
t = 20, loss = 0.3255
2000
t = 1, loss = 0.2675
t = 2, loss = 0.2658
t = 3, loss = 0.3032
t = 4, loss = 0.2889
t = 5, loss = 0.2514
t = 6, loss = 0.33