In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from __future__ import division
import sys
import os
from os.path import join as pjoin
from glob import glob
sys.path.append('..')

In [2]:
import cv2
import csv
import numpy as np
from sklearn.model_selection import train_test_split
from utils.commands import unzip, mkdir, call, count_file, KaggleCLI, execute_in

cli = KaggleCLI('state-farm-distracted-driver-detection')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [12]:
data_path = pjoin(os.getcwd(), 'data')
model_path = pjoin(os.getcwd(), 'models')
sample_path = pjoin(data_path, 'sample')
train_name = 'train'
valid_name = 'valid'
test_name = 'ttest'

## Data preprocess

In [4]:
cli.download_data()

In [5]:
for zfile in os.listdir(data_path):
    if zfile.endswith('.zip'):
        unzip(pjoin(data_path, zfile), data_path)

In [17]:
def mk_data_dirs():
    cgs = os.listdir(train_name)
    for name in [train_name, valid_name, test_name]:    
        for cg in cgs:
            mkdir(pjoin(name, cg))
            mkdir(pjoin(sample_path, name, cg))
            
execute_in(data_path, mk_data_dirs)
mkdir(model_path)

In [18]:
def settle_data():
    categories = os.listdir(train_name)
    for cg in categories:
        train_cg_path = pjoin(train_name, cg)
        train, test = train_test_split(os.listdir(train_cg_path), test_size=0.3)
        valid, test = train_test_split(test, test_size=0.4)
        
        for ds, ds_name in [(valid, valid_name), (test, test_name)]:
            for fn in ds:
                call('mv {} {}'.format(pjoin(train_cg_path, fn), pjoin(ds_name, cg)))
        
        for ds, ds_name in [(valid, valid_name), (test, test_name), (train, train_name)]:
            _, sample = train_test_split(ds, test_size=0.1)
            for fn in sample:
                call('cp {} {}'.format(pjoin(ds_name, cg, fn), pjoin(sample_path, ds_name, cg)))

execute_in(data_path, settle_data)

## Loading Data

In [20]:
work_path = sample_path

## Fine tune VGG

In [21]:
from utils.pretrained_models import VGG16
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint

In [22]:
vgg_model = VGG16.get_model(10).model

In [29]:
train_datagen = ImageDataGenerator()
valid_datagen = ImageDataGenerator()
test_datagen = ImageDataGenerator()

batch_size = 32

train_flow = train_datagen.flow_from_directory(
    pjoin(work_path, train_name), 
    target_size=(224, 224),
    class_mode='categorical',
    shuffle=True,
    batch_size=batch_size,
)
valid_flow = valid_datagen.flow_from_directory(
    pjoin(work_path, valid_name), 
    target_size=(224, 224),
    class_mode='categorical',
    batch_size=batch_size,
)
test_flow = test_datagen.flow_from_directory(
    pjoin(work_path, train_name), 
    target_size=(224, 224),
    class_mode='categorical',
    batch_size=batch_size,
)

Found 1574 images belonging to 10 classes.
Found 407 images belonging to 10 classes.
Found 1574 images belonging to 10 classes.


In [25]:
for l in vgg_model.layers[:-1]:
    l.trainable = False

In [30]:
train_steps = train_flow.samples // batch_size
valid_steps = valid_flow.samples // batch_size
epochs = 3

vgg_model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
checkpointer = ModelCheckpoint(pjoin(model_path, 'weights_best.hdf5'), save_best_only=True)
vgg_model.fit_generator(
        train_flow,
        steps_per_epoch=train_steps,
        epochs=epochs,
        validation_data=valid_flow,
        validation_steps=valid_steps,
        callbacks=[checkpointer])

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f03cccf92d0>

In [33]:
vgg_model.load_weights(pjoin(model_path, 'weights_best.hdf5'))
print vgg_model.evaluate_generator(test_flow)
print vgg_model.metrics_names

[0.48031948902555466, 0.8595933926302414]
['loss', 'acc']


In [17]:
preds = vgg_model.predict_generator(test_flow)

In [19]:
print len(preds[0])

37
