### imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import PIL.ImageOps
from PIL import Image
from sklearn.model_selection import StratifiedKFold

import env
from utils import KaggleCameraDataset, progress_iter

%matplotlib inline
%load_ext autoreload
%autoreload 2

## load training data

In [2]:
# Build the dataset object for the data stored under ../data/ with train=True.
# NOTE(review): lazy=True presumably defers reading/decoding the images until
# they are accessed -- confirm against utils.KaggleCameraDataset.
train_data = KaggleCameraDataset('../data/', train=True, lazy=True)

## calculate dataset size

In [None]:
# Side length (in pixels) of each square crop, and the number of augmented
# variants produced per source image.
crop_size = 256
n_crops = 8
N = len(train_data)

# Bytes needed when every crop is stored as crop_size x crop_size x 3 values
# of one byte each (the augmented array is allocated as uint8).
size = crop_size**2 * 3 * n_crops * N
# A parenthesised single-argument print behaves identically on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print("Dataset will occupy {0:.2f} GB".format(size / float(2**30)))

Dataset will occupy 4.03 GB


## generate augmented data (center crops + all 8 transforms of the D4 symmetry group)

In [None]:
# Every center crop is stored in all 8 orientations of the dihedral group D4:
# the crop under 4 rotations, followed by its mirror image under the same
# 4 rotations. `None` stands for "no rotation".
rotations = (None, Image.ROTATE_90, Image.ROTATE_180, Image.ROTATE_270)
n_variants = 2 * len(rotations)

X = np.zeros((N * n_variants, crop_size, crop_size, 3), dtype=np.uint8)

# Floor division keeps the crop box integral on both Python 2 and Python 3.
half = crop_size // 2

pos = 0
for x, _ in progress_iter(train_data, verbose=True):
    w, h = x.size
    x = x.crop((w // 2 - half, h // 2 - half,
                w // 2 + half, h // 2 + half))

    # Row order per image: crop, rot90, rot180, rot270, then the same four
    # orientations of the mirrored crop.
    for base in (x, PIL.ImageOps.mirror(x)):
        for rotation in rotations:
            variant = base if rotation is None else base.transpose(rotation)
            X[pos, ...] = np.asarray(variant, dtype=np.uint8)
            pos += 1

# Any row left unfilled would silently stay all-zero, so fail loudly instead.
assert pos == len(X), "filled {0} of {1} rows".format(pos, len(X))

A Jupyter Widget

In [None]:
# Each label is repeated 8 times in a row, matching the 8 consecutive
# augmented rows stored per source image.
labels_per_image = np.asarray(train_data.y)
y = labels_per_image.repeat(8)
y.shape

## split into folds
### what I'm going to do (illustrated on a toy example)

In [None]:
# Toy run of the reordering idea: ten samples, two classes, five stratified
# folds. `skf` is reused later for the real labels.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1337)
z = np.asarray([0, 1]).repeat(5)
for train_ind, test_ind in skf.split(z, z):
    # str.format reproduces the "train test" layout of the Python 2 print
    # statement while also running on Python 3.
    print("{0} {1}".format(train_ind, test_ind))

# Concatenating the per-fold test indices lists every sample exactly once,
# grouped fold by fold. The integer random_state makes this second call to
# split() yield the same folds as the loop above.
new_ind = np.concatenate([test_ind for _, test_ind in skf.split(z, z)])
print(new_ind)

In [None]:
# Permutation that groups the rows fold by fold: the test indices of each
# stratified fold, concatenated in fold order.
# NOTE(review): the split is made over the augmented rows (y holds one label
# per augmented crop), so the variants of a single source image can end up in
# different folds -- confirm this is acceptable for validation.
fold_test_indices = [test_ind for _, test_ind in skf.split(y, y)]
new_ind = np.concatenate(fold_test_indices)
# Parenthesised print works on both Python 2 and Python 3.
print(new_ind.shape)

In [None]:
# Apply the same permutation to images and labels so each row keeps its label.
X, y = X[new_ind], y[new_ind]

### reshape for easy extraction of train, val

In [None]:
# Split the row axis into (rows_per_part, 5) and move the length-5 axis to the
# front, so that X[k, j] is row j * 5 + k of the reordered array.
# Floor division keeps the dimension an int on Python 3 (reshape rejects
# floats); the per-image shape is taken from X rather than hard-coded.
# NOTE(review): with this layout X[k] takes every 5th row of the reordered
# array rather than one contiguous StratifiedKFold test fold -- confirm this
# is intended. The label array must use the same layout to stay aligned.
n_folds = 5
X = X.reshape((len(X) // n_folds, n_folds) + X.shape[1:]).transpose((1, 0, 2, 3, 4))
X.shape

In [None]:
# Same layout as used for the image array: split the label axis into
# (rows_per_part, 5) and transpose, so that y[k, j] is label j * 5 + k of the
# reordered labels. Floor division keeps the dimension an int on Python 3,
# where `/` would produce a float that reshape rejects.
n_folds = 5
y = y.reshape(len(y) // n_folds, n_folds).T
y.shape

### save

In [None]:
# Write the fold-arranged images and labels next to the raw data.
for out_path, array in (('../data/X_folds.npy', X),
                        ('../data/y_folds.npy', y)):
    np.save(out_path, array)