In [2]:
#!pip install optuna==1.5.0

In [3]:
import os
import re
import datetime
import random
import numpy as np
from PIL import Image
from glob import glob
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras

from sklearn.model_selection import train_test_split

from tensorflow.keras import models, layers
from tensorflow.keras.applications import Xception
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

from tensorflow.keras.preprocessing.image import ImageDataGenerator

from google.colab import drive
drive.mount('/content/drive')


def generate_data(x, t, num=5):
  datagen = ImageDataGenerator(
    #rotation_range=180,     # randomly rotate images in the range
    zoom_range=0.2,         # randomly zoom image
    width_shift_range=0.2,  # randomly shift images horizontally
    height_shift_range=0.2, # randomly shift images vertically
    horizontal_flip=True,   # randomly flip images horizontally
    #vertical_flip=True      # randomly flip images vertically
  )

  x_list = []
  t_list = []

  x_array = np.expand_dims(x, axis=0)

  datagen.fit(x_array)

  iter = datagen.flow(x_array, batch_size=1)

  for i in range(num):
    ret = iter.next()
    #print(ret.shape, ret.dtype)
    x_list.append(ret[0])
    t_list.append(t)

  return x_list, t_list


def prepare_data(gen_num=5):
  x, t = [], []

  categorized_dir_paths = glob('/content/drive/My Drive/kikagaku/novelapp/data/images/categorized-2/*')

  for dir_path in categorized_dir_paths:
    category_id = os.path.basename(dir_path)
    image_paths = glob(os.path.join(dir_path, '*'))
    print(datetime.datetime.now().isoformat(), 'Category', category_id, ':', len(image_paths))

    for i, p in enumerate(image_paths):
      book_id = re.sub(r'(_.*$)', '', os.path.basename(p))

      if i % 30 == 0:
          print(datetime.datetime.now().isoformat(), 'Image:', i, book_id, category_id)

      try:
          img = Image.open(p)
          img_resize = img.resize((229, 229))
          img_np = np.array(img_resize) / 255.0

          x.append(img_np)
          t.append(category_id)

          x_gen, t_gen = generate_data(img_np, category_id, gen_num)
          x.extend(x_gen)
          t.extend(t_gen)

      except Exception as e:
          print(datetime.datetime.now().isoformat(), 'Error:', e)

  return x, t


def reset_seed(seed=0):
    os.environ['PYTHONHASHSEED'] = '0'
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

reset_seed(0)


x, t = prepare_data(5)
print(len(x), len(t))

x = np.array(x).astype('f')
t = np.array(t).astype('f')
print(x.shape, t.shape)

x_train, x_val, t_train, t_val = train_test_split(x, t, test_size=0.3, random_state=0)
print(x_train.shape, x_val.shape)

category_count = len(np.unique(t))
print(np.unique(t))
print(category_count)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
2020-06-21T13:16:50.329817 Category 0 : 117
2020-06-21T13:16:50.330372 Image: 0 1015 0
2020-06-21T13:17:12.404772 Image: 30 1228 0
2020-06-21T13:17:33.841484 Image: 60 1106 0
2020-06-21T13:17:56.716844 Image: 90 1129 0
2020-06-21T13:18:15.433132 Category 1 : 119
2020-06-21T13:18:15.433483 Image: 0 3636 1
2020-06-21T13:18:36.116214 Image: 30 3722 1
2020-06-21T13:18:57.205021 Image: 60 3832 1
2020-06-21T13:19:17.573448 Image: 90 3777 1
2020-06-21T13:19:36.966055 Category 2 : 129
2020-06-21T13:19:36.966916 Image: 0 3601 2
2020-06-21T13:19:57.914049 Image: 30 3884 2
2020-06-21T13:20:18.639063 Image: 60 3778 2
2020-06-21T13:20:39.474088 Image: 90 3853 2
2020-06-21T13:20:59.264099 Image: 120 1502 2
2020-06-21T13:21:05.520628 Category 3 : 111
2020-06-21T13:21:05.521525 Image: 0 3312 3
2020-06-21T13:21:25.625946 Image: 30 2350 3
2020-06-21T13:21:46.063510 Image: 60 3

In [4]:
model_fine = Xception(include_top=False, weights='imagenet', input_shape=x_train.shape[1:])

for layer in model_fine.layers[:100]:
    layer.trainable = False

model = models.Sequential()
model.add(model_fine)

model.add(layers.Flatten())

model.add(layers.BatchNormalization())
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(0.1))
model.add(layers.Dense(category_count, activation='softmax'))

optimizer = keras.optimizers.Adagrad(lr=0.005)
#optimizer = keras.optimizers.Adam(lr=0.005)
#optimizer = keras.optimizers.SGD(lr=0.005)

model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/xception/xception_weights_tf_dim_ordering_tf_kernels_notop.h5


In [5]:
print(datetime.datetime.now().isoformat(), 'Start')

history = model.fit(
    x_train,
    t_train,
    batch_size=32,
    epochs=5,
    verbose=2,
    validation_data=(x_val, t_val)
)

print(datetime.datetime.now().isoformat(), 'End')

2020-06-21T13:22:58.785409 Start
Epoch 1/5
63/63 - 36s - loss: 2.2896 - accuracy: 0.4662 - val_loss: 1.2559 - val_accuracy: 0.5519
Epoch 2/5
63/63 - 34s - loss: 0.4480 - accuracy: 0.8479 - val_loss: 1.1821 - val_accuracy: 0.6359
Epoch 3/5
63/63 - 34s - loss: 0.1372 - accuracy: 0.9700 - val_loss: 1.0613 - val_accuracy: 0.7036
Epoch 4/5
63/63 - 34s - loss: 0.0659 - accuracy: 0.9870 - val_loss: 0.9122 - val_accuracy: 0.7491
Epoch 5/5
63/63 - 34s - loss: 0.0399 - accuracy: 0.9920 - val_loss: 0.7716 - val_accuracy: 0.7596
2020-06-21T13:26:05.182895 End


In [7]:
score = model.evaluate(x_val, t_val, verbose=1)
score



[0.7716102004051208, 0.7596266269683838]

In [10]:
history.history['val_accuracy'][-1]

0.7596266269683838

In [13]:
import optuna

In [14]:
def objective(trial):
  keras.backend.clear_session()

  lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)

  model_fine = Xception(include_top=False, weights='imagenet', input_shape=x_train.shape[1:])
  
  for layer in model_fine.layers[:100]:
      layer.trainable = False
  
  model = models.Sequential()
  model.add(model_fine)
  
  model.add(layers.Flatten())
  
  model.add(layers.BatchNormalization())
  model.add(layers.Dense(32, activation='relu'))
  model.add(layers.Dropout(0.1))
  model.add(layers.Dense(category_count, activation='softmax'))
  
  optimizer = keras.optimizers.Adagrad(lr=lr)
  #optimizer = keras.optimizers.Adam(lr=0.005)
  #optimizer = keras.optimizers.SGD(lr=0.005)
  
  model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

  print(datetime.datetime.now().isoformat(), 'Start')
  
  history = model.fit(
      x_train,
      t_train,
      validation_data=(x_val, t_val),
      batch_size=32,
      epochs=5,
      verbose=0,
  )
  
  print(datetime.datetime.now().isoformat(), 'End')

  return history.history['val_accuracy'][-1]

In [16]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10, timeout=600)

print('Number of finished trials: {}'.format(len(study.trials)))

print('Best trial:')
trial = study.best_trial

print('  Value: {}'.format(trial.value))

print('  Params: ')

for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

2020-06-21T13:37:54.229207 Start
2020-06-21T13:40:49.334078 End


[I 2020-06-21 13:40:49,635] Finished trial#0 with value: 0.48658108711242676 with parameters: {'lr': 6.274820530461714e-05}. Best is trial#0 with value: 0.48658108711242676.


2020-06-21T13:40:53.912834 Start
2020-06-21T13:43:49.721654 End


[I 2020-06-21 13:43:50,024] Finished trial#1 with value: 0.7176195979118347 with parameters: {'lr': 0.0017737890673030204}. Best is trial#1 with value: 0.7176195979118347.


2020-06-21T13:43:54.148913 Start
2020-06-21T13:46:53.023219 End


[I 2020-06-21 13:46:53,331] Finished trial#2 with value: 0.4877479672431946 with parameters: {'lr': 9.396554596561431e-05}. Best is trial#1 with value: 0.7176195979118347.


2020-06-21T13:46:57.583263 Start
2020-06-21T13:49:54.555761 End


[I 2020-06-21 13:49:54,849] Finished trial#3 with value: 0.25437572598457336 with parameters: {'lr': 0.022888718067637624}. Best is trial#1 with value: 0.7176195979118347.


Number of finished trials: 4
Best trial:
  Value: 0.7176195979118347
  Params: 
    lr: 0.0017737890673030204
