In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from __future__ import division
import sys
import os
from os.path import join as pjoin
from glob import glob
sys.path.append('..')

In [29]:
import cv2
import csv
import numpy as np
from sklearn.model_selection import train_test_split
from utils.commands import unzip, make_data_dir, mkdir, call, count_file, KaggleCLI, execute_in

cli = KaggleCLI('galaxy-zoo-the-galaxy-challenge')

In [12]:
data_path = 'data'
model_path = 'models'
sample_path = pjoin(data_path, 'sample')

## Data preprocess

In [11]:
cli.download_data()

In [8]:
for zfile in os.listdir(data_path):
    if zfile.endswith('.zip'):
        unzip(pjoin(data_path, zfile), data_path)

In [13]:
train_name = 'images_training_rev1'
valid_name = 'images_valid_rev1'
test2_name = 'images_test_rev2'
test_name = 'images_test_rev1'

In [14]:
mkdir(pjoin(sample_path, train_name))


In [20]:
def settle_data():
    train_data, sample_data = train_test_split(os.listdir(train_name), test_size=0.01)
    for s in sample_data:
        call('cp {}/{} sample/{}/'.format(train_name, s, train_name))

execute_in('data/', settle_data)

## Loading Data

In [34]:
work_path = sample_path

In [49]:
def files_to_dict(files):
    result = {}
    for f in files:
        img_arr = cv2.imread('{}/{}'.format(train_name, f))
        result[f.split('.')[0]] = cv2.resize(img_arr, (224, 224), interpolation=cv2.INTER_AREA)
    return result

def labeled_data(data, label):
    return zip(*[[v, label[k]] for k, v in data.iteritems()])

def load_data():
    train_files, test_files = train_test_split(os.listdir(train_name), test_size=0.3)
    valid_files, test_files = train_test_split(test_files, test_size=0.4)
    
    
    train_data = files_to_dict(train_files)
    valid_data = files_to_dict(valid_files)
    test_data = files_to_dict(test_files)
    
    with open('training_solutions_rev1.csv', 'rb') as f:
        reader = csv.reader(f)
        reader.next()
        label_data = {}
        for row in reader:
            label_data[row[0]] = [float(num) for num in row[1:]]
    
    return train_data, valid_data, test_data, label_data

train_data, valid_data, test_data, label_data = execute_in(work_path, load_data)

train_data = labeled_data(train_data, label_data)
test_data = labeled_data(test_data, label_data)
valid_data = labeled_data(valid_data, label_data)

## Fine tune VGG

In [47]:
from utils.pretrained_models import VGG16
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint

In [48]:
vgg_model = VGG16.get_model(37).model

In [51]:
print train_data[0][0], train_data[1][0]

[[[4 4 4]
  [2 2 2]
  [2 2 2]
  ...
  [0 2 0]
  [0 3 1]
  [0 3 1]]

 [[4 4 4]
  [1 1 1]
  [0 0 0]
  ...
  [0 2 0]
  [0 3 1]
  [0 3 1]]

 [[2 2 2]
  [1 1 1]
  [0 0 0]
  ...
  [0 2 0]
  [0 3 1]
  [0 3 1]]

 ...

 [[2 2 2]
  [2 2 2]
  [5 5 5]
  ...
  [4 4 4]
  [3 3 3]
  [3 3 3]]

 [[4 4 4]
  [4 4 4]
  [3 3 3]
  ...
  [4 4 4]
  [4 4 4]
  [2 2 2]]

 [[4 4 4]
  [5 5 5]
  [3 3 3]
  ...
  [2 2 2]
  [2 2 2]
  [0 0 0]]] [0.1978, 0.8022, 0.0, 0.8022, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.038699, 0.961301, 0.0, 0.0, 0.1978, 0.0, 0.0, 0.038699, 0.0, 0.0, 0.0, 0.0, 0.171310612, 0.044789233, 0.586100155, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [54]:
train_datagen = ImageDataGenerator()
valid_datagen = ImageDataGenerator()
test_datagen = ImageDataGenerator()
train_flow = train_datagen.flow(np.array(train_data[0]), np.array(train_data[1]))
valid_flow = valid_datagen.flow(np.array(valid_data[0]), np.array(valid_data[1]))
test_flow = test_datagen.flow(np.array(test_data[0]), np.array(test_data[1]))

In [55]:
for l in vgg_model.layers[:-1]:
    l.trainable = False

In [56]:
vgg_model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
checkpointer = ModelCheckpoint(pjoin(model_path, 'weights_best.hdf5'), save_best_only=True)
vgg_model.fit_generator(
        train_flow,
        steps_per_epoch=10,
        epochs=1,
        validation_data=valid_flow,
        validation_steps=10,
        callbacks=[checkpointer])

Epoch 1/1

IOError: Unable to create file (unable to open file: name = 'models/weights_best.hdf5', errno = 2, error message = 'No such file or directory', flags = 13, o_flags = 602)

In [13]:
vgg_model.load_weights(pjoin(model_path, 'weights_best.hdf5'))
vgg_model.evaluate_generator(valid_flow)

[3.3346356303809443e-05, 1.0]

In [14]:
preds = vgg_model.predict_generator(test_flow)

In [18]:
isdog = preds[:,1]
ids = np.array([int(f.split('.')[-2].split('/')[1]) for f in test_flow.filenames])
submission = np.stack([ids, isdog], axis=1)

In [19]:
result_path = pjoin(data_path, 'submission.csv')
np.savetxt(result_path, submission, fmt='%d, %.5f', header='id,label', comments="")

In [None]:
cli.submit_result(result_path)