In [2]:
#!pip install optuna==1.5.0

In [3]:
import os
import re
import datetime
import random
import numpy as np
from PIL import Image
from glob import glob
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras

from sklearn.model_selection import train_test_split

from tensorflow.keras import models, layers
from tensorflow.keras.applications import Xception
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

from tensorflow.keras.preprocessing.image import ImageDataGenerator

from google.colab import drive
drive.mount('/content/drive')


def generate_data(x, t, num=5):
  datagen = ImageDataGenerator(
    #rotation_range=180,     # randomly rotate images in the range
    zoom_range=0.2,         # randomly zoom image
    width_shift_range=0.2,  # randomly shift images horizontally
    height_shift_range=0.2, # randomly shift images vertically
    horizontal_flip=True,   # randomly flip images horizontally
    #vertical_flip=True      # randomly flip images vertically
  )

  x_list = []
  t_list = []

  x_array = np.expand_dims(x, axis=0)

  datagen.fit(x_array)

  iter = datagen.flow(x_array, batch_size=1)

  for i in range(num):
    ret = iter.next()
    #print(ret.shape, ret.dtype)
    x_list.append(ret[0])
    t_list.append(t)

  return x_list, t_list


def prepare_data(gen_num=5):
  x, t = [], []

  categorized_dir_paths = glob('/content/drive/My Drive/kikagaku/novelapp/data/images/categorized-2/*')

  for dir_path in categorized_dir_paths:
    category_id = os.path.basename(dir_path)
    image_paths = glob(os.path.join(dir_path, '*'))
    print(datetime.datetime.now().isoformat(), 'Category', category_id, ':', len(image_paths))

    for i, p in enumerate(image_paths):
      book_id = re.sub(r'(_.*$)', '', os.path.basename(p))

      if i % 30 == 0:
          print(datetime.datetime.now().isoformat(), 'Image:', i, book_id, category_id)

      try:
          img = Image.open(p)
          img_resize = img.resize((229, 229))
          img_np = np.array(img_resize) / 255.0

          x.append(img_np)
          t.append(category_id)

          x_gen, t_gen = generate_data(img_np, category_id, gen_num)
          x.extend(x_gen)
          t.extend(t_gen)

      except Exception as e:
          print(datetime.datetime.now().isoformat(), 'Error:', e)

  return x, t


def reset_seed(seed=0):
    os.environ['PYTHONHASHSEED'] = '0'
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

reset_seed(0)


#x, t = prepare_data(5)
x, t = prepare_data(3)
print(len(x), len(t))

x = np.array(x).astype('f')
t = np.array(t).astype('f')
print(x.shape, t.shape)

x_train, x_val, t_train, t_val = train_test_split(x, t, test_size=0.3, random_state=0)
print(x_train.shape, x_val.shape)

category_count = len(np.unique(t))
print(np.unique(t))
print(category_count)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
2020-06-22T08:48:02.522770 Category 0 : 117
2020-06-22T08:48:02.523795 Image: 0 1015 0
2020-06-22T08:48:24.880628 Image: 30 1228 0
2020-06-22T08:48:45.471751 Image: 60 1106 0
2020-06-22T08:49:05.814294 Image: 90 1129 0
2020-06-22T08:49:24.125185 Category 1 : 119
2020-06-22T08:49:24.125884 Image: 0 3636 1
2020-06-22T08:49:46.674806 Image: 30 3722 1
2020-06-22T08:50:06.393290 Image: 60 3832 1
2020-06-22T08:50:25.808649 Image: 90 3777 1
2020-06-22T08:50:44.096612 Category 2 : 129
2020-06-22T08:50:44.097351 Image: 0 3601 2
2020-06-22T08:51:04.240250 Image: 30 3884 2
2020-06-22T08:51:24.151069 Image: 60 3778 2
2020-06-22T08:51:44.021272 Image: 90 3853 2
2020-06-22T08:52:03.433971 Image: 120 1502 2
2020-06-22T08:52:09.410742 Category 3 : 111
2020-06-22T08:52:09.411203 Image: 0 3312 3
2020-06-22T08:52:29.228416 Image: 30 2350 3
2020-06-22T08:52:49.366981 Image: 60 3

In [5]:
model_fine = Xception(include_top=False, weights='imagenet', input_shape=x_train.shape[1:])

for layer in model_fine.layers[:100]:
    layer.trainable = False

model = models.Sequential()
model.add(model_fine)

model.add(layers.Flatten())

model.add(layers.BatchNormalization())
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(0.1))
model.add(layers.Dense(category_count, activation='softmax'))

optimizer = keras.optimizers.Adagrad(lr=0.005)
#optimizer = keras.optimizers.Adam(lr=0.005)
#optimizer = keras.optimizers.SGD(lr=0.005)

model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/xception/xception_weights_tf_dim_ordering_tf_kernels_notop.h5


In [6]:
print(datetime.datetime.now().isoformat(), 'Start')

history = model.fit(
    x_train,
    t_train,
    batch_size=32,
    epochs=5,
    verbose=2,
    validation_data=(x_val, t_val)
)

print(datetime.datetime.now().isoformat(), 'End')

2020-06-22T08:54:15.553171 Start
Epoch 1/5
42/42 - 13s - loss: 4.2581 - accuracy: 0.3333 - val_loss: 1.3186 - val_accuracy: 0.4213
Epoch 2/5
42/42 - 10s - loss: 0.9856 - accuracy: 0.6171 - val_loss: 1.2975 - val_accuracy: 0.4580
Epoch 3/5
42/42 - 10s - loss: 0.4955 - accuracy: 0.8356 - val_loss: 1.2363 - val_accuracy: 0.5210
Epoch 4/5
42/42 - 11s - loss: 0.2321 - accuracy: 0.9294 - val_loss: 1.1576 - val_accuracy: 0.5839
Epoch 5/5
42/42 - 11s - loss: 0.1057 - accuracy: 0.9775 - val_loss: 1.0660 - val_accuracy: 0.6171
2020-06-22T08:55:22.880633 End


In [7]:
history.history['val_accuracy'][-1]

0.617132842540741

In [8]:
import optuna

In [10]:
def objective(trial):
  keras.backend.clear_session()

  lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)

  model_fine = Xception(include_top=False, weights='imagenet', input_shape=x_train.shape[1:])
  
  for layer in model_fine.layers[:100]:
      layer.trainable = False
  
  model = models.Sequential()
  model.add(model_fine)
  
  model.add(layers.Flatten())
  
  model.add(layers.BatchNormalization())
  model.add(layers.Dense(32, activation='relu'))
  model.add(layers.Dropout(0.1))
  model.add(layers.Dense(category_count, activation='softmax'))
  
  optimizer = keras.optimizers.Adagrad(lr=lr)
  #optimizer = keras.optimizers.Adam(lr=0.005)
  #optimizer = keras.optimizers.SGD(lr=0.005)
  
  model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

  #print(datetime.datetime.now().isoformat(), 'Start')
  
  history = model.fit(
      x_train,
      t_train,
      validation_data=(x_val, t_val),
      batch_size=32,
      epochs=10,
      verbose=0,
  )
  
  #print(datetime.datetime.now().isoformat(), 'End')

  return history.history['val_accuracy'][-1]

In [11]:
study = optuna.create_study(direction='maximize')
#study.optimize(objective, n_trials=10, timeout=600)
study.optimize(objective, n_trials=100, timeout=7200)

print('Number of finished trials: {}'.format(len(study.trials)))

print('Best trial:')
trial = study.best_trial

print('  Value: {}'.format(trial.value))

print('  Params: ')

for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

[I 2020-06-22 09:00:13,601] Finished trial#0 with value: 0.2919580340385437 with parameters: {'lr': 0.029094288926490134}. Best is trial#0 with value: 0.2919580340385437.
[I 2020-06-22 09:02:12,355] Finished trial#1 with value: 0.25874125957489014 with parameters: {'lr': 0.029443366250492764}. Best is trial#0 with value: 0.2919580340385437.
[I 2020-06-22 09:04:13,448] Finished trial#2 with value: 0.5419580340385437 with parameters: {'lr': 0.0001571367077719973}. Best is trial#2 with value: 0.5419580340385437.
[I 2020-06-22 09:06:14,155] Finished trial#3 with value: 0.6381118893623352 with parameters: {'lr': 0.001001260929709082}. Best is trial#3 with value: 0.6381118893623352.
[I 2020-06-22 09:08:14,853] Finished trial#4 with value: 0.7115384340286255 with parameters: {'lr': 0.007453398285115892}. Best is trial#4 with value: 0.7115384340286255.
[I 2020-06-22 09:10:15,657] Finished trial#5 with value: 0.6223776340484619 with parameters: {'lr': 0.00031994142928826945}. Best is trial#4 wi

Number of finished trials: 60
Best trial:
  Value: 0.7395104765892029
  Params: 
    lr: 0.006873628666609448


In [12]:
df = study.trials_dataframe()

In [13]:
df.head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_lr,state
0,0,0.291958,2020-06-22 08:58:14.100121,2020-06-22 09:00:13.601021,00:01:59.500900,0.029094,COMPLETE
1,1,0.258741,2020-06-22 09:00:13.612205,2020-06-22 09:02:12.354893,00:01:58.742688,0.029443,COMPLETE
2,2,0.541958,2020-06-22 09:02:12.357933,2020-06-22 09:04:13.448712,00:02:01.090779,0.000157,COMPLETE
3,3,0.638112,2020-06-22 09:04:13.451406,2020-06-22 09:06:14.155160,00:02:00.703754,0.001001,COMPLETE
4,4,0.711538,2020-06-22 09:06:14.159605,2020-06-22 09:08:14.853523,00:02:00.693918,0.007453,COMPLETE


In [15]:
df.sort_values(by='value', ascending=False).head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_lr,state
52,52,0.73951,2020-06-22 10:42:34.723680,2020-06-22 10:44:35.879929,00:02:01.156249,0.006874,COMPLETE
55,55,0.729021,2020-06-22 10:48:37.772889,2020-06-22 10:50:38.792563,00:02:01.019674,0.005481,COMPLETE
44,44,0.723776,2020-06-22 10:26:30.394592,2020-06-22 10:28:31.211809,00:02:00.817217,0.006343,COMPLETE
11,11,0.723776,2020-06-22 09:20:19.687646,2020-06-22 09:22:20.322475,00:02:00.634829,0.004882,COMPLETE
42,42,0.713287,2020-06-22 10:22:29.435589,2020-06-22 10:24:30.264668,00:02:00.829079,0.004309,COMPLETE
4,4,0.711538,2020-06-22 09:06:14.159605,2020-06-22 09:08:14.853523,00:02:00.693918,0.007453,COMPLETE
37,37,0.708042,2020-06-22 10:12:27.882593,2020-06-22 10:14:28.783373,00:02:00.900780,0.007228,COMPLETE
27,27,0.708042,2020-06-22 09:52:27.282922,2020-06-22 09:54:28.023416,00:02:00.740494,0.006597,COMPLETE
6,6,0.701049,2020-06-22 09:10:15.662777,2020-06-22 09:12:16.626063,00:02:00.963286,0.00399,COMPLETE
54,54,0.695804,2020-06-22 10:46:36.675118,2020-06-22 10:48:37.768885,00:02:01.093767,0.003679,COMPLETE


In [16]:
df.sort_values(by='params_lr', ascending=False).head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_lr,state
13,13,0.29021,2020-06-22 09:24:21.191148,2020-06-22 09:26:21.113263,00:01:59.922115,0.096647,COMPLETE
16,16,0.298951,2020-06-22 09:30:22.313745,2020-06-22 09:32:22.127299,00:01:59.813554,0.096502,COMPLETE
24,24,0.258741,2020-06-22 09:46:25.412743,2020-06-22 09:48:25.247046,00:01:59.834303,0.058024,COMPLETE
40,40,0.293706,2020-06-22 10:18:30.279933,2020-06-22 10:20:28.406700,00:01:58.126767,0.055775,COMPLETE
50,50,0.288462,2020-06-22 10:38:33.849009,2020-06-22 10:40:33.828252,00:01:59.979243,0.039343,COMPLETE
30,30,0.346154,2020-06-22 09:58:27.247671,2020-06-22 10:00:27.106567,00:01:59.858896,0.038346,COMPLETE
1,1,0.258741,2020-06-22 09:00:13.612205,2020-06-22 09:02:12.354893,00:01:58.742688,0.029443,COMPLETE
0,0,0.291958,2020-06-22 08:58:14.100121,2020-06-22 09:00:13.601021,00:01:59.500900,0.029094,COMPLETE
29,29,0.297203,2020-06-22 09:56:28.460512,2020-06-22 09:58:27.243487,00:01:58.782975,0.026533,COMPLETE
15,15,0.52972,2020-06-22 09:28:21.675883,2020-06-22 09:30:22.306996,00:02:00.631113,0.024069,COMPLETE


In [25]:
#df[df['params_lr'] < 0.008].head()
#df[df['params_lr'] > 0.004].head()
#df[(df['params_lr'] < 0.008) & (df['params_lr'] > 0.004)].sort_values(by='params_lr', ascending=False)
df[(df['params_lr'] < 0.008) & (df['params_lr'] > 0.004)].sort_values(by='params_lr', ascending=False).loc[:, ['value', 'params_lr']]

Unnamed: 0,value,params_lr
4,0.711538,0.007453
37,0.708042,0.007228
28,0.496504,0.007172
52,0.73951,0.006874
51,0.494755,0.006704
27,0.708042,0.006597
14,0.646853,0.006545
44,0.723776,0.006343
32,0.648601,0.005564
55,0.729021,0.005481
