# action plan

0 - download data from plankton competition

1 - create validation/sample set

2 - rearrange image files to proper directories

3 - get vgg16 model setup with keras

4 - finetune and train model

5 - generate predictions

6 - validate predictions

7 - submit predictions to kaggle

# 0 - download data

In [15]:
%pwd

'/home/yns207/nbs/machine_learning'

In [16]:
import os,sys
# Project root is wherever the notebook kernel was started; all data lives
# in a `data` sub-directory underneath it.
CUR_DIR = os.getcwd()
current_dir = CUR_DIR
DATA_DIR = os.path.join(CUR_DIR, 'data')
print(CUR_DIR, DATA_DIR)

/home/yns207/nbs/machine_learning /home/yns207/nbs/machine_learning/data


In [17]:
%%bash
# Credentials are read from the environment (export KAGGLE_USERNAME and
# KAGGLE_PASSWORD before starting Jupyter) so they are never stored in the
# notebook. Any password that was previously committed here should be rotated.
: "${KAGGLE_USERNAME:?set KAGGLE_USERNAME before running this cell}"
: "${KAGGLE_PASSWORD:?set KAGGLE_PASSWORD before running this cell}"
kg config -g -u "$KAGGLE_USERNAME" -p "$KAGGLE_PASSWORD" -c 'datasciencebowl'
# download the competition files into ./data
mkdir -p data
cd data
kg download

downloading https://www.kaggle.com/c/datasciencebowl/download/sampleSubmission.csv.zip

downloading https://www.kaggle.com/c/datasciencebowl/download/train.zip

downloading https://www.kaggle.com/c/datasciencebowl/download/test.zip

downloading https://www.kaggle.com/c/datasciencebowl/download/plankton_identification.pdf



                                                                               sampleSubmission.csv.zip N/A% |                      | ETA:  --:--:--   0.0 s/B                                                                               sampleSubmission.csv.zip   0% |                      | ETA: 0:04:59   2.8 KiB/s                                                                               sampleSubmission.csv.zip   7% |#                     | ETA: 0:00:05 153.9 KiB/s                                                                               sampleSubmission.csv.zip  22% |####                  | ETA: 0:00:01 421.1 KiB/s                                                                               sampleSubmission.csv.zip  58% |############          | ETA: 0:00:00 977.6 KiB/s                                                                               sampleSubmission.csv.zip 100% |#####################| Time: 0:00:00   1.5 MiB/s
                                

In [18]:
%cd $DATA_DIR
!unzip -q test.zip
!unzip -q train.zip

/home/yns207/nbs/machine_learning/data


In [19]:
# Sanity check: list the data directory, then count the lines that `ls -l`
# prints for test/ and for train/.
%ls -l $DATA_DIR
%ls -l $DATA_DIR/test/ | wc -l
%ls -l $DATA_DIR/train/ | wc -l

total 451371
-rw-r-----   1 yns207 yns207   1621177  9 mai   15:51 plankton_identification.pdf
-rw-r-----   1 yns207 yns207    870561  9 mai   15:50 [0m[38;5;9msampleSubmission.csv.zip[0m
drwxr-sr-x   2 yns207 yns207    130402  9 déc.   2014 [38;5;27mtest[0m/
-rw-r-----   1 yns207 yns207 363454024  9 mai   15:51 [38;5;9mtest.zip[0m
drwxr-sr-x 123 yns207 yns207       123  9 déc.   2014 [38;5;27mtrain[0m/
-rw-r-----   1 yns207 yns207  79921404  9 mai   15:50 [38;5;9mtrain.zip[0m
130401
122


In [26]:
%mkdir  $DATA_DIR/test/unknown
%mv $DATA_DIR/test/*.jpg  $DATA_DIR/test/unknown/

mkdir: impossible de créer le répertoire « /home/yns207/nbs/machine_learning/data/test/unknown »: Le fichier existe


# 1 - create validation and sample set

In [20]:
from shutil import copytree, copyfile, ignore_patterns
import numpy as np
from glob import glob

In [21]:
# Create the working directories (relative to the data directory):
# models/ for saved weights and outputs, sample/ for a small subset,
# and test/unknown for the test images. -p keeps this safe to re-run.
%cd $DATA_DIR
%mkdir -p models
%mkdir -p sample/test sample/models
%mkdir -p test/unknown

/home/yns207/nbs/machine_learning/data


In [22]:
%cd $DATA_DIR/train

/home/yns207/nbs/machine_learning/data/train


In [23]:
# Make a copy of the training folder tree under valid/:
# every class directory is copied, but the *.jpg files inside are ignored,
# so valid/ starts out as an empty mirror of train/.
# NOTE(review): copytree is expected to fail if valid/ already exists -
# confirm before re-running this cell.
copytree(os.path.join(DATA_DIR,'train'), os.path.join(DATA_DIR,'valid'), ignore=ignore_patterns('*.jpg'))

'/home/yns207/nbs/machine_learning/data/valid'

In [24]:
# Split off a validation set: move (at most) 2000 randomly chosen training
# images from train/<class>/ to the matching valid/<class>/ directory.
# Sampling uniformly over all images is meant to match the original class
# distribution, but it may be wise to come back later and make sure there
# are at least 5 images from each category.
train_dir = os.path.join(DATA_DIR, 'train')
valid_dir = os.path.join(DATA_DIR, 'valid')
# sorted input + fixed seed -> the same split on every run
g = sorted(glob(os.path.join(train_dir, '*', '*.jpg')))
shuff = np.random.RandomState(42).permutation(g)
# slicing never indexes past the end when fewer than 2000 images exist
for p in shuff[:2000]:
    # keep the '<class>/<file>.jpg' part and re-root it under valid/;
    # relpath is robust even if 'train/' occurs elsewhere in the path
    pnew = os.path.join(valid_dir, os.path.relpath(p, train_dir))
    os.rename(p, pnew)

In [25]:
%ls -lah $DATA_DIR/valid/* | wc -l

2604


In [26]:
# Make the sample tree for the validation set: mirror the class directories
# of train/ under sample/valid/, ignoring the *.jpg files themselves.
copytree(os.path.join(DATA_DIR,'train'), os.path.join(DATA_DIR,'sample','valid'), ignore=ignore_patterns('*.jpg'))

'/home/yns207/nbs/machine_learning/data/sample/valid'

In [27]:
# Build a small validation sample: copy (at most) 50 randomly chosen images
# from valid/<class>/ into the matching sample/valid/<class>/ directory.
valid_dir = os.path.join(DATA_DIR, 'valid')
sample_valid_dir = os.path.join(DATA_DIR, 'sample', 'valid')
# sorted input + fixed seed -> the same sample on every run
g = sorted(glob(os.path.join(valid_dir, '*', '*.jpg')))
shuff = np.random.RandomState(42).permutation(g)
# slicing never indexes past the end when fewer than 50 images exist
for p in shuff[:50]:
    # re-root '<class>/<file>.jpg' under sample/valid/
    pnew = os.path.join(sample_valid_dir, os.path.relpath(p, valid_dir))
    copyfile(p, pnew)

In [28]:
# Make the sample tree for the training set: mirror the class directories
# of train/ under sample/train/, ignoring the *.jpg files themselves.
copytree(os.path.join(DATA_DIR,'train'), os.path.join(DATA_DIR,'sample','train'), ignore=ignore_patterns('*.jpg'))

'/home/yns207/nbs/machine_learning/data/sample/train'

In [29]:
# Build a small training sample: copy (at most) 200 randomly chosen images
# from train/<class>/ into the matching sample/train/<class>/ directory.
train_dir = os.path.join(DATA_DIR, 'train')
sample_train_dir = os.path.join(DATA_DIR, 'sample', 'train')
# sorted input + fixed seed -> the same sample on every run
g = sorted(glob(os.path.join(train_dir, '*', '*.jpg')))
shuff = np.random.RandomState(42).permutation(g)
# slicing never indexes past the end when fewer than 200 images exist
for p in shuff[:200]:
    # re-root '<class>/<file>.jpg' under sample/train/
    pnew = os.path.join(sample_train_dir, os.path.relpath(p, train_dir))
    copyfile(p, pnew)

In [30]:
%ls -lah $DATA_DIR/sample/train/* | head -n 20
%ls -lah $DATA_DIR/sample/valid/* | head -n 20

/home/yns207/nbs/machine_learning/data/sample/train/acantharia_protist:
total 3,0K
drwxr-sr-x   2 yns207 yns207    6  9 mai   15:53 [0m[38;5;27m.[0m/
drwxr-sr-x 123 yns207 yns207  123  9 déc.   2014 [38;5;27m..[0m/
-rw-r-----   1 yns207 yns207  775  9 mai   15:53 [38;5;13m122445.jpg[0m
-rw-r-----   1 yns207 yns207 1,6K  9 mai   15:53 [38;5;13m155350.jpg[0m
-rw-r-----   1 yns207 yns207 1,3K  9 mai   15:53 [38;5;13m159648.jpg[0m
-rw-r-----   1 yns207 yns207 1,9K  9 mai   15:53 [38;5;13m8914.jpg[0m

/home/yns207/nbs/machine_learning/data/sample/train/acantharia_protist_big_center:
total 1,0K
drwxr-sr-x   2 yns207 yns207   2  9 déc.   2014 [38;5;27m.[0m/
drwxr-sr-x 123 yns207 yns207 123  9 déc.   2014 [38;5;27m..[0m/

/home/yns207/nbs/machine_learning/data/sample/train/acantharia_protist_halo:
total 1,0K
drwxr-sr-x   2 yns207 yns207   2  9 mai   15:53 [38;5;27m.[0m/
drwxr-sr-x 123 yns207 yns207 123  9 déc.   2014 [38;5;27m..[0m/

/home/yns207/nbs/machine_learning/data/

# 3 - get vgg16 model setup with keras

In [2]:
import h5py
import numpy as np
import PIL

from keras import backend as K
from keras.utils.data_utils import get_file
from keras.models import Sequential
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD, RMSprop, Adam
from keras.preprocessing import image

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5105)


In [3]:
FILE_PATH = 'http://files.fast.ai/models/'

In [4]:
# load in the vgg imagenet model
get_file('vgg16.h5', FILE_PATH + 'vgg16.h5')

'/home/yns207/.keras/datasets/vgg16.h5'

In [5]:
# Mean pixel value per colour channel, shaped (3, 1, 1) so it broadcasts
# over the height/width axes of a channels-first image batch.
vgg_mean = np.array([123.68, 116.779, 103.939], dtype=np.float32)[:, None, None]

def vgg_preprocess(x):
    """Centre the input on the VGG mean pixel values and flip the channel order.

    Used as the first (Lambda) layer of the VGG skeleton: the per-channel
    mean is subtracted, then axis 1 is reversed (rgb -> bgr), which is the
    channel order VGG expects.
    """
    centered = x - vgg_mean
    return centered[:, ::-1]

In [6]:
# drop any model left over from a previous run
model = None

# VGG16 skeleton: a preprocessing layer, five convolutional blocks,
# then the fully connected classifier.
model = Sequential()
model.add(Lambda(vgg_preprocess, input_shape=(3,224,224), output_shape=(3,224,224)))

# (number of conv layers, number of filters) for each block; every conv is
# 3x3 with relu, preceded by 1-pixel zero padding, and every block ends
# with a 2x2 max-pool of stride 2.
vgg_conv_blocks = [(2, 64), (2, 128), (3, 256), (3, 512), (3, 512)]
for n_convs, n_filters in vgg_conv_blocks:
    for conv_idx in range(n_convs):
        model.add(ZeroPadding2D((1,1)))
        model.add(Convolution2D(n_filters, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

# classifier: two 4096-unit dense layers with dropout, then the
# 1000-way softmax that matches the pretrained weights
model.add(Flatten())
for fc_idx in range(2):
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
model.add(Dense(1000, activation='softmax'))

# fill the vgg16 skeleton with the actual trained weights of the vgg16 model
# (get_file returns the local path of the downloaded/cached file)
vgg_weights_path = get_file('vgg16.h5', FILE_PATH + 'vgg16.h5')
model.load_weights(vgg_weights_path)

# 4 - finetune and retrain last layer

In [7]:
# Swap the classifier head for fine-tuning:
#  1. drop the final 1000-way softmax layer,
#  2. freeze every remaining layer,
#  3. append a fresh 121-way softmax layer, which (being added after the
#     freeze loop) is the only layer left trainable.
model.pop()
for pretrained_layer in model.layers:
    pretrained_layer.trainable = False
model.add(Dense(121, activation='softmax'))

In [8]:
# Compile with RMSprop (learning rate 0.001) and categorical cross-entropy,
# reporting accuracy alongside the loss.
opt = RMSprop(lr=0.001)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

# get training and validation data

In [9]:
import os,sys
# Reuse the project root if an earlier cell already recorded it: earlier
# cells change the working directory with %cd, so os.getcwd() is only the
# project root on a freshly started kernel.
try:
    CUR_DIR
except NameError:
    CUR_DIR = os.getcwd()
current_dir = CUR_DIR
DATA_DIR = os.path.join(CUR_DIR, 'data')
path = DATA_DIR
# a trailing '' component keeps the trailing separator on each directory path
test_path = os.path.join(DATA_DIR, 'test', '')
models_path = os.path.join(DATA_DIR, 'models', '')
train_path = os.path.join(path, 'train', '')
valid_path = os.path.join(path, 'valid', '')
# number of images per batch for training, validation and prediction
batch_size = 64

In [10]:
# Training and validation batches are read straight from the class
# sub-directories, with identical settings for both.
gen = image.ImageDataGenerator()
flow_opts = dict(target_size=(224,224), class_mode='categorical', shuffle=True, batch_size=batch_size)
tr_batches = gen.flow_from_directory(train_path, **flow_opts)
va_batches = gen.flow_from_directory(valid_path, **flow_opts)

Found 28336 images belonging to 121 classes.
Found 2000 images belonging to 121 classes.


# fit model and save weights

In [10]:
# Train for 5 epochs, validating after each one. Steps are computed with
# floor division, so only whole batches are counted per epoch.
model.fit_generator(tr_batches, 
                    steps_per_epoch=tr_batches.n//batch_size, 
                    validation_data=va_batches, 
                    validation_steps=va_batches.n//batch_size,
                    epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2b7236105550>

In [181]:
# Continue training the same model for one more epoch with the same
# step counts (whole batches only, via floor division).
model.fit_generator(tr_batches, 
                    steps_per_epoch=tr_batches.n//batch_size, 
                    validation_data=va_batches, 
                    validation_steps=va_batches.n//batch_size,
                    epochs=1)

Epoch 1/1


<keras.callbacks.History object at 0x2b7470c8eda0>

In [182]:
model.save_weights(os.path.join(models_path,'plankton_finetune_full_short_2.h5'))

# load model, data, and evaluate

In [44]:
model.load_weights(os.path.join(models_path,'plankton_finetune_full_short_2.h5'))

In [45]:
# `steps` is a number of batches, so it must be a whole number: round up so
# the final, partial batch of the validation set is evaluated too.
val_steps = int(np.ceil(va_batches.n / float(batch_size)))
model.evaluate_generator(va_batches, val_steps, workers=2)

[2.515652633666992, 0.48299999999999998]

# make predictions on test data and upload to kaggle

In [None]:
gen = image.ImageDataGenerator()
# shuffle=False is essential here: flow_from_directory shuffles by default,
# and the predictions are later paired with test_batches.filenames by
# position, so the images must be yielded in filename order.
test_batches = gen.flow_from_directory(test_path, target_size=(224,224), class_mode=None, shuffle=False, batch_size=batch_size)

Found 130400 images belonging to 1 classes.


In [None]:
# `steps` is a number of batches: round up so every test image is predicted
# exactly once (the last batch may be smaller than batch_size).
test_steps = int(np.ceil(test_batches.n / float(batch_size)))
# Row i of `preds` is later matched to test_batches.filenames[i], so the
# batches must come back in order. A single worker (the default) is used
# because, NOTE(review), some Keras versions can return batches out of
# order when several workers pull from the same generator - confirm for
# the installed version before re-enabling workers.
preds = model.predict_generator(test_batches, test_steps)

In [None]:
preds.shape

In [None]:
# preview the first few rows only instead of dumping the whole array
preds[:5]

In [None]:
import bcolz
def save_array(fname, arr):
    """Write `arr` to disk as a bcolz carray rooted at directory `fname`.

    The carray is opened in mode 'w' and flushed before returning.
    """
    carr = bcolz.carray(arr, rootdir=fname, mode='w')
    carr.flush()


def load_array(fname):
    """Open the bcolz array stored at `fname` and return its full contents."""
    stored = bcolz.open(fname)
    return stored[:]

In [None]:
# write predictions
save_array(os.path.join(models_path,'plankton_preds_may122017_2.bc'),preds)

In [None]:
%ls -lah $models_path/

In [61]:
preds.shape

(130400, 121)

In [None]:
filenames = test_batches.filenames
# Each filename is '<subdir>/<image id>.jpg'; take the numeric id from the
# basename rather than relying on a fixed character offset for the
# sub-directory prefix.
ids = np.array([int(os.path.splitext(os.path.basename(f))[0]) for f in filenames])
ids.shape

In [None]:
sub = np.column_stack((ids,preds))
sub.shape

In [None]:
# preview the first few rows only instead of dumping the whole array
sub[:5]

In [65]:
# Column i of the predictions belongs to the class whose index is i, so
# order the class names by their assigned index (not merely by name).
class_indices = tr_batches.class_indices
unique_classes = [str(label) for label in sorted(class_indices, key=class_indices.get)]
# first column of `sub` is the image id, the rest are class probabilities
n_classes = sub.shape[1] - 1
assert len(unique_classes) == n_classes, "header and prediction columns disagree"
header = ','.join(unique_classes)
header = 'image,' + header
# '<id>.jpg' followed by one 5-decimal probability per class
row_fmt = ','.join(['%d.jpg'] + ['%.5f'] * n_classes)
np.savetxt(os.path.join(models_path, 'kaggle_plankton_submission_2.csv'), sub, header=header, comments='', delimiter=',', fmt=row_fmt)

In [66]:
from IPython.display import FileLink
%cd ~/nbs/machine_learning/
FileLink('data/models/kaggle_plankton_submission_2.csv')

/home/yns207/nbs/machine_learning


https://github.com/fchollet/keras/issues/3945 -negative dimension error
https://github.com/fchollet/keras/issues/3426 -h5py
https://github.com/NathanYee/planktonDataScienceBowl/blob/master/scripts/plankton_model_vgg16.ipynb
http://stackoverflow.com/questions/40030481/numpy-savetxt-save-one-column-as-int-and-the-rest-as-floats
pip install pillow h5py keras tensorflow theano numpy
