In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from __future__ import division
import sys
import os
from os.path import join as pjoin
from glob import glob
sys.path.append('..')

In [2]:
import cv2
import csv
import numpy as np
from sklearn.model_selection import train_test_split
from utils.commands import unzip, make_data_dir, mkdir, call, count_file, KaggleCLI, execute_in

# Project helper bound to the Galaxy Zoo competition slug; its download_data()
# method is called in the preprocessing section.
cli = KaggleCLI('galaxy-zoo-the-galaxy-challenge')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [6]:
# Directory layout, relative to the notebook's working directory.
train_name = 'images_training_rev1'  # folder name of the training images
model_path = 'models'                # where model weights are written
data_path = 'data'
sample_path = os.path.join(data_path, 'sample')

## Data preprocessing

In [11]:
# Fetch the competition files through the KaggleCLI helper.
# NOTE(review): presumably downloads into `data_path` and needs Kaggle
# credentials to be configured; confirm in utils.commands.
cli.download_data()

In [8]:
# Hand every .zip found directly in data_path to the project's unzip helper
# (the second argument is presumably the extraction target).
zip_names = [name for name in os.listdir(data_path) if name.endswith('.zip')]
for zip_name in zip_names:
    unzip(pjoin(data_path, zip_name), data_path)

In [17]:
# Create the sample image folder and the model folder used by later cells.
for folder in (pjoin(sample_path, train_name), model_path):
    mkdir(folder)

In [11]:
def settle_data(sample_fraction=0.01, seed=None):
    """Copy a random subset of the training images, plus the labels CSV, into ``sample/``.

    Every path below is relative, so this must run with the data directory as
    the working directory (it is invoked through ``execute_in``). The target
    folder ``sample/<train_name>/`` has to exist already.

    Args:
        sample_fraction: share of the training images to copy; forwarded to
            ``train_test_split`` as ``test_size``. Defaults to 0.01.
        seed: forwarded to ``train_test_split`` as ``random_state``. ``None``
            (the default) draws a different subset on every call.
    """
    import shutil  # local import keeps this cell independent of the import cell

    # os.listdir order is arbitrary; sorting lets a fixed seed pin the subset.
    image_names = sorted(os.listdir(train_name))
    _, sample_names = train_test_split(
        image_names, test_size=sample_fraction, random_state=seed)

    sample_dir = pjoin('sample', train_name)
    for name in sample_names:
        # Copy through the standard library instead of a formatted `cp` command
        # line, so file names are never interpreted by a shell.
        shutil.copy(pjoin(train_name, name), pjoin(sample_dir, name))
    shutil.copy('training_solutions_rev1.csv',
                pjoin('sample', 'training_solutions_rev1.csv'))

# Run the sampling step inside the data directory; reuse the configured
# data_path instead of repeating the 'data/' literal.
execute_in(data_path, settle_data)

## Loading Data

In [4]:
# Directory that load_data is executed in; currently the sampled subset
# (sample_path) rather than the full data directory.
work_path = sample_path

In [7]:
def files_to_dict(files, size=(224, 224)):
    """Read images from the ``train_name`` folder and return them resized.

    Args:
        files: file names (not paths) inside ``train_name``, which is resolved
            relative to the current working directory.
        size: ``(width, height)`` handed to ``cv2.resize``; defaults to 224x224.

    Returns:
        dict mapping the part of each file name before its first dot to the
        resized image array.

    Raises:
        IOError: if OpenCV cannot read a file. ``cv2.imread`` signals that by
            returning ``None`` rather than raising.
    """
    result = {}
    for f in files:
        path = '{}/{}'.format(train_name, f)
        img_arr = cv2.imread(path)
        if img_arr is None:
            # Name the offending file instead of failing inside cv2.resize.
            raise IOError('could not read image: {}'.format(path))
        result[f.split('.')[0]] = cv2.resize(img_arr, size, interpolation=cv2.INTER_AREA)
    return result

def labeled_data(data, label):
    """Pair every value in ``data`` with its label and split into two sequences.

    Args:
        data: dict mapping a key to a value (an image, in this notebook).
        label: dict mapping the same keys to their labels.

    Returns:
        A two-element list ``[values, labels]`` of equal-length tuples in
        matching order, or an empty list when ``data`` is empty.

    Raises:
        KeyError: if a key of ``data`` has no entry in ``label``.
    """
    # .items() and list(zip(...)) behave the same on Python 2 and 3, whereas
    # .iteritems() is Python 2 only and a lazy Python 3 zip cannot be indexed.
    pairs = [(value, label[key]) for key, value in data.items()]
    return list(zip(*pairs))

def load_data(seed=None):
    """Split the images in ``train_name`` into train/valid/test and load the labels.

    Paths are relative, so this runs in whichever directory ``execute_in``
    selects. 70% of the files go to training; the remaining 30% is split 60/40
    into validation and test.

    Args:
        seed: forwarded to both ``train_test_split`` calls as ``random_state``.
            ``None`` (the default) gives a different split on every call.

    Returns:
        ``(train_data, valid_data, test_data, label_data)``: three dicts built
        by ``files_to_dict`` and a dict mapping the first CSV column to the
        list of floats parsed from the remaining columns.
    """
    # os.listdir order is arbitrary; sorting makes a seeded split repeatable.
    image_files = sorted(os.listdir(train_name))
    train_files, held_out_files = train_test_split(
        image_files, test_size=0.3, random_state=seed)
    valid_files, test_files = train_test_split(
        held_out_files, test_size=0.4, random_state=seed)

    train_data = files_to_dict(train_files)
    valid_data = files_to_dict(valid_files)
    test_data = files_to_dict(test_files)

    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csv_file = open('training_solutions_rev1.csv', 'rb')
    else:
        csv_file = open('training_solutions_rev1.csv', 'r', newline='')
    with csv_file as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row; an empty file yields no labels
        label_data = {}
        for row in reader:
            if not row:
                continue  # ignore blank lines
            label_data[row[0]] = [float(num) for num in row[1:]]

    return train_data, valid_data, test_data, label_data

# Load the three image dicts and the label table from work_path, then turn each
# dict into an [images, labels] pair. The final names are the ones later cells use.
train_dict, valid_dict, test_dict, label_data = execute_in(work_path, load_data)

train_data, valid_data, test_data = (
    labeled_data(split, label_data)
    for split in (train_dict, valid_dict, test_dict)
)

## Fine-tune VGG

In [8]:
from utils.pretrained_models import VGG16
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint

In [9]:
# Build the project's VGG16 wrapper and keep its underlying model object.
# NOTE(review): 37 is presumably the output size, i.e. the number of label
# columns per image (the recorded prediction length is 37); confirm against
# utils.pretrained_models.
vgg_model = VGG16.get_model(37).model

In [10]:
# Plain generators (no augmentation options set), one per split.
train_datagen = ImageDataGenerator()
valid_datagen = ImageDataGenerator()
test_datagen = ImageDataGenerator()
train_flow = train_datagen.flow(np.array(train_data[0]), np.array(train_data[1]))
# flow() shuffles by default. Keep validation and test batches in their original
# order so that predict_generator output rows line up with test_data[1].
valid_flow = valid_datagen.flow(np.array(valid_data[0]), np.array(valid_data[1]), shuffle=False)
test_flow = test_datagen.flow(np.array(test_data[0]), np.array(test_data[1]), shuffle=False)

In [11]:
# Mark every layer except the last one as non-trainable.
frozen_layers = vgg_model.layers[:-1]
for layer in frozen_layers:
    layer.trainable = False

In [13]:
# NOTE(review): 'accuracy' is reported next to an MSE loss on real-valued
# targets; it is kept for continuity with the recorded output but says little here.
vgg_model.compile(optimizer='rmsprop',
              loss='mse',
              metrics=['accuracy'])
# Keep only the best weights seen so far (ModelCheckpoint monitors val_loss by default).
checkpointer = ModelCheckpoint(pjoin(model_path, 'weights_best.hdf5'), save_best_only=True)
# Derive the step counts from the flows (len() is the number of batches) so one
# epoch is exactly one pass over the data, whatever the size of work_path.
vgg_model.fit_generator(
        train_flow,
        steps_per_epoch=len(train_flow),
        epochs=1,
        validation_data=valid_flow,
        validation_steps=len(valid_flow),
        callbacks=[checkpointer])

Epoch 1/1


<keras.callbacks.History at 0x7f19b439c250>

In [15]:
# Restore the checkpoint written during training, then score it on the
# validation flow; the result is displayed as the cell output.
best_weights_file = pjoin(model_path, 'weights_best.hdf5')
vgg_model.load_weights(best_weights_file)
vgg_model.evaluate_generator(valid_flow)

[0.05667893346902486, 0.576576580335428]

In [17]:
# Run the model over the test flow and keep the predictions for inspection.
preds = vgg_model.predict_generator(test_flow)

In [19]:
# Sanity check: length of one prediction vector. The parenthesised form prints
# the same text on Python 2 and is valid syntax on Python 3.
print(len(preds[0]))

37
