### Preparation of Training and Validation Sets

In [1]:
import os

In [4]:
# All datasets live under ./data, relative to the directory the notebook runs from.
lesson_dir = os.getcwd()
data_dir = os.path.join(lesson_dir, 'data')
if not os.path.isdir(data_dir):
    os.mkdir(data_dir)

In [5]:
# The original training images are expected under <data_dir>/train, one sub-directory per category.
train_dir = os.path.join(data_dir, 'train')
# Keep only sub-directories (stray files are ignored). The loop variable is deliberately
# not called `dir`: that shadows the builtin and, on Python 2, leaks out of the
# comprehension and replaces `dir()` for the rest of the session.
categories = [entry for entry in os.listdir(train_dir) if os.path.isdir(os.path.join(train_dir, entry))]

In [4]:
# Build the <data_dir>/<full|sample>/<train|validation>/<category> tree,
# leaving any directory that already exists untouched.
for directory in ['full', 'sample']:
    for subdirectory in ['train', 'validation']:
        for category in categories:
            full_dir = os.path.join(data_dir, directory, subdirectory, category)
            if os.path.isdir(full_dir):
                continue
            os.makedirs(full_dir)

In [5]:
def copyfiles(list_file_paths, target_dir):
    """Copy each file in `list_file_paths` into `target_dir` using `shutil.copy`."""
    for src_path in list_file_paths:
        shutil.copy(src_path, target_dir)

In [6]:
import shutil
import glob
try:
    # Current home of train_test_split.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only provide the deprecated module.
    from sklearn.cross_validation import train_test_split

sample_size = 10  # max images per category/split copied into the small 'sample' dataset
split_seed = 42   # fixed seed so the train/validation split is the same on every run

for category in categories:
    # glob order is filesystem-dependent; sorting makes the seeded split reproducible.
    img_files = sorted(glob.glob(os.path.join(train_dir, category, 'img*')))
    # Hold out 15% of each category for validation. With a fixed seed, re-running this
    # cell copies the same files again instead of mixing a new random split into the
    # existing directories (which would leak validation images into train).
    train_img, val_img = train_test_split(img_files, test_size=0.15, random_state=split_seed)
    copyfiles(train_img, os.path.join(data_dir, 'full', 'train', category))
    copyfiles(val_img, os.path.join(data_dir, 'full', 'validation', category))
    # The 'sample' dataset is just the first few files of each split.
    copyfiles(train_img[:sample_size], os.path.join(data_dir, 'sample', 'train', category))
    copyfiles(val_img[:sample_size], os.path.join(data_dir, 'sample', 'validation', category))

### Load VGG16 Model and Weights

In [2]:
import sys
sys.path.insert(1, '../')
from vgg16 import Vgg16
from utils import *

Using Theano backend.
Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


In [6]:
# Dataset the model works on. 'full' is the complete train/validation split; swap to the
# commented 'sample' line for a quick run on the small per-category subset.
#model_data_dir = os.path.join(data_dir, 'sample')
model_data_dir = os.path.join(data_dir, 'full')

In [7]:
# Instantiate the Vgg16 wrapper from the local vgg16 module; the cells below use its `.model` attribute.
vgg16 = Vgg16()

In [8]:
def pop_model_layers(model, nlayers):
    """Call `model.pop()` `nlayers` times, removing that many trailing layers in place."""
    for _ in range(nlayers):
        model.pop()

In [9]:
# Remove the last two layers of the VGG16 model — presumably leaving the 4096-wide layer as
# output (the cached feature arrays below are 4096 wide).
# NOTE: not idempotent — re-running this cell pops two more layers; re-create `vgg16` first.
pop_model_layers(vgg16.model, 2)

### Get VGG16 Features

In [10]:
# One image per batch, in fixed (unshuffled) order — presumably so that row i of the predicted
# features lines up with `*_batches.classes[i]` used for the labels; verify against utils.get_batches.
train_batches = get_batches(os.path.join(model_data_dir, 'train'), shuffle=False, batch_size=1, target_size=(224,224))
val_batches = get_batches(os.path.join(model_data_dir, 'validation'), shuffle=False, batch_size=1, target_size=(224,224))

Found 19056 images belonging to 10 classes.
Found 3368 images belonging to 10 classes.


In [11]:
# One-hot encode each generator's class ids with the `onehot` helper (star-imported from utils).
train_labels = onehot(train_batches.classes)
val_labels = onehot(val_batches.classes)

In [12]:
import bcolz
def save_array(fname, arr):
    """Write `arr` to a bcolz carray rooted at `fname` (opened with mode 'w') and flush it to disk."""
    carr = bcolz.carray(arr, rootdir=fname, mode='w')
    carr.flush()
def load_array(fname):
    """Open the bcolz store at `fname` and return its full contents (the `[:]` slice)."""
    stored = bcolz.open(fname)
    return stored[:]

In [13]:
# Extracting VGG16 features is expensive, so reuse the bcolz cache when it exists and only
# run the truncated network (then write the cache) when it is missing. This keeps the
# notebook runnable from a fresh checkout without un-commenting code by hand.
# NOTE: the cache is keyed by file name only — delete it after changing model_data_dir.
if os.path.exists('train_features.bc'):
    train_features = load_array('train_features.bc')
else:
    train_features = vgg16.model.predict_generator(train_batches, train_batches.N)
    save_array('train_features.bc', train_features)

In [14]:
# Same cache-or-compute pattern for the validation set: load the bcolz cache if present,
# otherwise run the truncated network over the validation batches and cache the result.
# NOTE: the cache is keyed by file name only — delete it after changing model_data_dir.
if os.path.exists('val_features.bc'):
    val_features = load_array('val_features.bc')
else:
    val_features = vgg16.model.predict_generator(val_batches, val_batches.N)
    save_array('val_features.bc', val_features)

In [15]:
# Sanity check: one feature row per validation image.
val_features.shape

(3368, 4096)

In [16]:
# Sanity check: the label rows must match the feature rows above.
val_labels.shape

(3368, 10)

In [17]:
from keras.models import Sequential
# Linear classifier on top of the precomputed features: a single softmax layer taking
# 4096-wide inputs and producing 10 outputs, trained with plain SGD.
lm = Sequential([
    Dense(10, activation="softmax", input_shape=(4096,)),
])
lm.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

In [23]:
# Train the linear layer on the cached features for 10 epochs, validating on the held-out set.
# NOTE(review): batch_size equals the whole training set, so each epoch is presumably a single
# gradient step (10 updates in total) — confirm this is intended.
lm.fit(train_features, train_labels, batch_size=len(train_features), nb_epoch=10, validation_data=(val_features, val_labels));

Train on 19056 samples, validate on 3368 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
# Persist the linear model's weights to linear.h5 in the working directory.
lm.save_weights('linear.h5')

In [21]:
# Write both feature arrays to the bcolz paths that the loading cells read from.
save_array('train_features.bc', train_features)
save_array('val_features.bc', val_features)

### Predictions

In [29]:
# Test images, one per batch and unshuffled — presumably so `test_batches.filenames` stays
# aligned with the rows of the test predictions; confirm in utils.get_batches.
test_batches = get_batches(os.path.join(data_dir, 'test'), shuffle=False, batch_size=1, target_size=(224,224))

Found 79726 images belonging to 1 classes.


In [25]:
# Cache-or-compute for the test set: load the bcolz cache if present, otherwise run the
# truncated network over every test image and cache the result so later runs are fast.
if os.path.exists('test_features.bc'):
    test_features = load_array('test_features.bc')
else:
    test_features = vgg16.model.predict_generator(test_batches, test_batches.N)
    save_array('test_features.bc', test_features)

In [32]:
# Write the test features to the bcolz path that the loading cell reads from.
save_array('test_features.bc', test_features)

In [26]:
# Class-probability predictions from the linear model, one row per test feature vector.
test_preds = lm.predict_proba(test_features)



In [27]:
# Peek at the predicted probabilities (numpy elides the middle of large arrays).
test_preds

array([[  1.1701e-01,   8.7711e-03,   1.9402e-04, ...,   3.7086e-02,   9.3559e-02,   6.3164e-09],
       [  4.1925e-01,   2.0548e-02,   1.4706e-04, ...,   1.0565e-01,   8.1237e-02,   2.2366e-10],
       [  7.0407e-05,   4.6851e-06,   1.7796e-01, ...,   2.3803e-05,   1.2520e-01,   1.7776e-12],
       ..., 
       [  7.8182e-03,   1.1924e-03,   2.9820e-05, ...,   4.5712e-02,   8.9333e-02,   2.1313e-10],
       [  8.5779e-01,   2.1967e-03,   3.9021e-05, ...,   5.3938e-03,   6.2099e-02,   5.5776e-11],
       [  2.2212e-01,   7.4201e-04,   1.1398e-03, ...,   8.9940e-04,   1.6070e-03,   3.1491e-13]], dtype=float32)

In [28]:
# Sanity check: one row per test image, one column per class.
test_preds.shape

(79726, 10)

### Build the Submission File

In [31]:
# Clamp every probability into [0.0125, 0.9875] so no prediction is exactly 0 or 1 —
# presumably to bound the penalty under a log-loss metric; confirm against the competition rules.
# Despite the name, `labels` holds per-class probabilities, not hard labels.
labels = np.clip(test_preds, 0.0125, 0.9875)

In [33]:
# Submission ids are the bare image file names. `os.path.basename` strips the directory part
# whatever the path separator or nesting depth, whereas `split("/")[1]` raised IndexError on
# paths without a "/" and returned a directory name for deeper paths.
ids = [os.path.basename(filename) for filename in test_batches.filenames]

In [34]:
# Put the file names in the first column, followed by the clipped class probabilities.
# Mixing strings and floats yields a single string-typed array, hence fmt="%s" when saving.
submission = np.column_stack([ids, labels])

In [None]:
# Write the submission CSV; comments='' stops numpy from prefixing the header row with '# '.
np.savetxt('redux1.csv', submission, delimiter=',', header='img,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9', fmt="%s", comments='')