Reference: Keras Applications, usage examples for image classification models: https://keras.io/applications/#usage-examples-for-image-classification-models

In [133]:
import math
import os
import datetime

import numpy as np
import pandas as pd

from keras.preprocessing import image
from keras.layers import Input, Lambda
from keras.models import Model

from keras.applications import xception
from keras.applications import inception_v3

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score

from secrets import KAGGLE_USER, KAGGLE_PW

In [26]:
# Competition slug; it is part of the data path and is also passed to the `kg` CLI below.
competition_name = 'dog-breed-identification'
# Directory holding the train/, valid/ and test/ folders, labels.csv and results/.
data_dir = '/opt/notebooks/data/{}/preprocessed'.format(competition_name)

In [27]:
# Image generator built with default arguments (none are passed here);
# the cells below use it only through flow_from_directory to read images from disk.
gen = image.ImageDataGenerator()

In [80]:
# Number of images per batch requested from every directory generator.
batch_size = 32
# Size every image is resized to on load; agrees with the default
# inputs_shape=(299, 299, 3) of add_preprocess.
target_size = (299, 299)

Found 8222 images belonging to 120 classes.
Found 2000 images belonging to 120 classes.
Found 10357 images belonging to 1 classes.


In [63]:
def add_preprocess(base_model, preprocess_func, inputs_shape=(299, 299, 3)):
    """Wrap ``base_model`` so that preprocessing happens inside the model.

    Args:
        base_model: callable model applied to the preprocessed tensor.
        preprocess_func: function wrapped in a ``Lambda`` layer and applied
            to the raw input tensor before ``base_model``.
        inputs_shape: shape (without the batch axis) of the new input layer.

    Returns:
        A ``Model`` mapping the raw input to ``base_model``'s output.
    """
    raw_input = Input(shape=inputs_shape)
    # Preprocess first, then feed the result straight into the base network.
    features = base_model(Lambda(preprocess_func)(raw_input))
    return Model(raw_input, features)

### Xception

In [89]:
# Fresh generators over the train and validation folders. shuffle=False keeps
# a fixed read order, which the cells below rely on when pairing predicted
# feature rows with `.classes`.
batches = gen.flow_from_directory(
    data_dir + '/train',
    shuffle=False,
    target_size=target_size,
    batch_size=batch_size,
)
batches_val = gen.flow_from_directory(
    data_dir + '/valid',
    shuffle=False,
    target_size=target_size,
    batch_size=batch_size,
)

# Steps needed to go through each split once; ceil covers a final partial batch.
nb_batches = math.ceil(batches.n / batch_size)
nb_batches_val = math.ceil(batches_val.n / batch_size)

# Per-image class labels as exposed by the generators.
y_encode, y_val_encode = batches.classes, batches_val.classes

Found 8222 images belonging to 120 classes.
Found 2000 images belonging to 120 classes.


In [64]:
# Xception with ImageNet weights, without the classification head;
# pooling='avg' is requested for the output.
base_model = xception.Xception(
    weights='imagenet',
    include_top=False,
    pooling='avg',
)

In [65]:
# Xception feature extractor with its own preprocessing built in as the first layer.
model_x = add_preprocess(
    base_model=base_model,
    preprocess_func=xception.preprocess_input,
)

In [66]:
# Xception bottleneck features for the training split.
bf_x = model_x.predict_generator(
    batches,
    steps=nb_batches,
    verbose=1,
)



In [128]:
# This is the first write into the results folder. np.save does not create
# missing directories, so create the folder up front; every later save and the
# submission CSV reuse the same directory.
results_dir = data_dir + '/results'
if not os.path.isdir(results_dir):
    os.makedirs(results_dir)

# Cache the Xception training features on disk.
np.save(data_dir+'/results/bf_x', bf_x)

In [67]:
# Xception bottleneck features for the validation split.
bf_val_x = model_x.predict_generator(
    batches_val,
    steps=nb_batches_val,
    verbose=1,
)



In [129]:
# Cache the Xception validation features on disk.
np.save(file=data_dir + '/results/bf_val_x', arr=bf_val_x)

In [68]:
# Multinomial logistic regression trained on the Xception features only.
logreg = LogisticRegression(solver='lbfgs', multi_class='multinomial')
# Left as the last expression so the fitted estimator is displayed.
logreg.fit(X=bf_x, y=y_encode)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [69]:
# Hard labels and class probabilities for the validation split (Xception features).
valid_preds = logreg.predict(bf_val_x)
valid_probs = logreg.predict_proba(bf_val_x)

In [70]:
# Validation log loss of the Xception-only classifier.
log_loss(y_true=y_val_encode, y_pred=valid_probs)

0.32573993342313973

In [71]:
# Validation accuracy of the Xception-only classifier.
accuracy_score(y_true=y_val_encode, y_pred=valid_preds)

0.90400000000000003

### Inception

In [90]:
# Recreate the train/validation generators so the Inception pass reads each
# split from its first image again; shuffle=False keeps the read order fixed.
batches = gen.flow_from_directory(
    data_dir + '/train',
    shuffle=False,
    target_size=target_size,
    batch_size=batch_size,
)
batches_val = gen.flow_from_directory(
    data_dir + '/valid',
    shuffle=False,
    target_size=target_size,
    batch_size=batch_size,
)

# Steps needed to go through each split once; ceil covers a final partial batch.
nb_batches = math.ceil(batches.n / batch_size)
nb_batches_val = math.ceil(batches_val.n / batch_size)

# Per-image class labels as exposed by the generators.
y_encode, y_val_encode = batches.classes, batches_val.classes

Found 8222 images belonging to 120 classes.
Found 2000 images belonging to 120 classes.


In [81]:
# InceptionV3 with ImageNet weights, without the classification head;
# pooling='avg' is requested for the output. Rebinds `base_model`.
base_model = inception_v3.InceptionV3(
    weights='imagenet',
    include_top=False,
    pooling='avg',
)

In [82]:
# InceptionV3 feature extractor with its own preprocessing built in as the first layer.
model_i = add_preprocess(
    base_model=base_model,
    preprocess_func=inception_v3.preprocess_input,
)

In [83]:
# InceptionV3 bottleneck features for the training split.
bf_i = model_i.predict_generator(
    batches,
    steps=nb_batches,
    verbose=1,
)



In [126]:
# Cache the InceptionV3 training features on disk.
np.save(file=data_dir + '/results/bf_i', arr=bf_i)

In [84]:
# InceptionV3 bottleneck features for the validation split.
bf_val_i = model_i.predict_generator(
    batches_val,
    steps=nb_batches_val,
    verbose=1,
)



In [127]:
# Cache the InceptionV3 validation features on disk.
np.save(file=data_dir + '/results/bf_val_i', arr=bf_val_i)

In [85]:
# Multinomial logistic regression trained on the InceptionV3 features only
# (rebinds `logreg`, replacing the Xception-only classifier).
logreg = LogisticRegression(solver='lbfgs', multi_class='multinomial')
# Left as the last expression so the fitted estimator is displayed.
logreg.fit(X=bf_i, y=y_encode)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [86]:
# Hard labels and class probabilities for the validation split (InceptionV3 features).
valid_preds = logreg.predict(bf_val_i)
valid_probs = logreg.predict_proba(bf_val_i)

In [87]:
# Validation log loss of the InceptionV3-only classifier.
log_loss(y_true=y_val_encode, y_pred=valid_probs)

0.33937718519330201

In [88]:
# Validation accuracy of the InceptionV3-only classifier.
accuracy_score(y_true=y_val_encode, y_pred=valid_preds)

0.89549999999999996

### LogReg on all bottleneck features

In [95]:
# Put the Xception and InceptionV3 features side by side, one row per image.
X = np.hstack((bf_x, bf_i))
V = np.hstack((bf_val_x, bf_val_i))

# Final classifier trained on the combined features; this `logreg` is the one
# used for the test predictions further down.
logreg = LogisticRegression(solver='lbfgs', multi_class='multinomial')
logreg.fit(X, y_encode)

valid_preds = logreg.predict(V)
valid_probs = logreg.predict_proba(V)

In [96]:
log_loss(y_val_encode, valid_probs)

0.28645922654116385

In [97]:
accuracy_score(y_val_encode, valid_preds)

0.90749999999999997

### Predict test data

In [101]:
# Image ids are the file names without their extension. os.path.splitext strips
# only the final extension, whereas split('.')[0] would cut an id at its first
# dot; the loop variable also no longer shadows the Python 2 builtin `file`.
test_ids = [os.path.splitext(fname)[0] for fname in os.listdir(data_dir + '/test/unknown')]

In [102]:
# Preview the first ids, for comparison with the generator's file order shown in the next cell.
test_ids[:3]

['88e129e775a1f3417785818f46bc7c2a',
 '4944813fa9c0c93048f6bac5b5cd3d49',
 '012ca7efe684c5cdfb83f35e8fbafe1b']

In [103]:
# Unshuffled generator over the test folder.
batches_test = gen.flow_from_directory(
    data_dir + '/test',
    shuffle=False,
    target_size=target_size,
    batch_size=batch_size,
)
# Show the first file names the generator will read, to compare with test_ids.
batches_test.filenames[:3]

Found 10357 images belonging to 1 classes.


['unknown/88e129e775a1f3417785818f46bc7c2a.jpg',
 'unknown/4944813fa9c0c93048f6bac5b5cd3d49.jpg',
 'unknown/012ca7efe684c5cdfb83f35e8fbafe1b.jpg']

In [104]:
# Fresh test generator for the Xception pass.
batches_test = gen.flow_from_directory(
    data_dir + '/test',
    shuffle=False,
    target_size=target_size,
    batch_size=batch_size,
)
# Steps needed to go through the test set once; ceil covers a final partial batch.
nb_batches_test = math.ceil(batches_test.n / batch_size)

Found 10357 images belonging to 1 classes.


In [105]:
# Xception bottleneck features for the test images.
bf_x_test = model_x.predict_generator(batches_test, steps=nb_batches_test, verbose=1)



In [130]:
# Cache the Xception test features on disk.
np.save(file=data_dir + '/results/bf_x_test', arr=bf_x_test)

In [106]:
# Fresh test generator for the InceptionV3 pass, so reading starts again from
# the first test image.
batches_test = gen.flow_from_directory(
    data_dir + '/test',
    shuffle=False,
    target_size=target_size,
    batch_size=batch_size,
)
# Steps needed to go through the test set once; ceil covers a final partial batch.
nb_batches_test = math.ceil(batches_test.n / batch_size)

Found 10357 images belonging to 1 classes.


In [107]:
# InceptionV3 bottleneck features for the test images.
bf_i_test = model_i.predict_generator(batches_test, steps=nb_batches_test, verbose=1)



In [131]:
# Cache the InceptionV3 test features on disk.
np.save(file=data_dir + '/results/bf_i_test', arr=bf_i_test)

In [108]:
# Same column layout as the training matrix: Xception features first, then InceptionV3.
X_test = np.hstack((bf_x_test, bf_i_test))
# Class probabilities from the classifier trained on the combined features.
test_probs = logreg.predict_proba(X_test)

### Make test submission file

In [111]:
# Take the ids from the generator that fed the models rather than from a
# separate os.listdir() call: with shuffle=False, `batches_test.filenames` is
# the order in which the test images were read, so id i belongs to row i of
# test_probs. Pairing by position with an independently obtained listing would
# silently mislabel rows if the two orders or file sets ever differed.
subm_ids = [os.path.splitext(os.path.basename(name))[0] for name in batches_test.filenames]
if len(subm_ids) != len(test_probs):
    raise ValueError('%d test ids but %d prediction rows' % (len(subm_ids), len(test_probs)))

# Keep the probabilities numeric (stacking them with the string ids in a single
# array coerces every value to a string) and put the id in as the first column.
# The frame still has one id column followed by one column per class, so the
# later `subm.columns = cols` assignment applies unchanged.
subm = pd.DataFrame(test_probs)
subm.insert(0, 'id', subm_ids)

In [112]:
# Label table of the competition; only its `breed` column is used below.
labels = pd.read_csv(data_dir + '/labels.csv')

In [113]:
# Submission header: the id column followed by the breed names in sorted order.
# NOTE(review): this assumes the classifier's class order matches the sorted
# breed names; confirm against the generator's class mapping.
cols = ['id'] + sorted(labels['breed'].unique())

In [118]:
# Give the submission frame its header (id plus one column per breed).
subm.columns = cols

# Tag used in the output file name and, in the submit cell, as the submission message.
description = 'beluga_batch_lambda_preprocess'
# Minute-resolution timestamp, so repeated runs get distinct file names.
timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')
submission_file_name = data_dir+'/results/%s_%s.csv' % (description, timestamp)

subm.to_csv(submission_file_name, index=False)

### Submit

In [124]:
# Shell call to the `kg` CLI with the Kaggle user, password and competition name
# (presumably this stores them in the CLI's configuration; verify against the kg docs).
# NOTE(review): the password is expanded onto the command line; confirm this is
# acceptable on the machine running the notebook.
!kg config -g -u $KAGGLE_USER -p $KAGGLE_PW -c $competition_name

In [125]:
# Submit the CSV at `submission_file_name`, passing `description` as the -m argument.
# NOTE(review): the recorded output shows "list index out of range" before the
# score, which looks like an error on the CLI side; confirm the submission was
# registered as expected.
!kg submit $submission_file_name -u $KAGGLE_USER -p $KAGGLE_PW -m $description

list index out of range


Your submission scored 0.30091