In [102]:
import os
import sys
import time
import random
import sklearn
import PIL.Image

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from pprint import pprint
from copy import deepcopy

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
# Default plot appearance: 12x12 figures, no axis grid.
mpl.rcParams.update({
    'figure.figsize': (12, 12),
    'axes.grid': False,
})

# Record the interpreter and library versions used for this run.
# NOTE: `tf` is listed twice, so its version line is printed twice.
print(sys.version_info)
for module in (tf, mpl, np, pd, sklearn, tf, keras):
    print(f"{module.__name__} {module.__version__}")

sys.version_info(major=3, minor=6, micro=7, releaselevel='final', serial=0)
tensorflow 2.4.0
matplotlib 3.3.3
numpy 1.19.2
pandas 1.1.5
sklearn 0.24.0
tensorflow 2.4.0
tensorflow.keras 2.4.0


In [217]:
# Prevent the PIL error "image file is truncated (# bytes not processed)"
# by telling PIL to load truncated image files anyway.
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [2]:
# GPU configurations
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

# Build the session configuration with soft device placement enabled.
config = ConfigProto(allow_soft_placement=True)

# GPU memory options: a per-process memory fraction of 0.9 together with
# on-demand growth.
config.gpu_options.per_process_gpu_memory_fraction = 0.9
config.gpu_options.allow_growth = True

# Open an interactive session that uses the configuration above.
session = InteractiveSession(config=config)

## Load data

In [107]:
# Rename dataset files (see rename_files).
# Run this cell only once, or again whenever new data is added to the folders.
# Both paths end with a path separator.
models_path = "D:\\datasets\\theme-classification\\models\\"
patterns_path = "D:\\datasets\\theme-classification\\patterns\\"

def rename_files(path, cwd):
    """Normalise the names of the entries directly inside ``path``.

    For every entry:
      * if the name contains a space, whitespace runs are collapsed into a
        single underscore (leading/trailing whitespace is dropped);
      * the Chinese keywords "条纹", "格子" and "几何图形" are replaced by
        "stride", "grid" and "geometry" respectively.
    Each new name is printed before the entry is renamed.

    Args:
        path: Directory whose entries are renamed. May be absolute or
            relative to the current working directory.
        cwd: Directory that becomes the working directory when the function
            finishes, whether it succeeds or raises.

    Raises:
        OSError: If ``path`` cannot be listed or a rename fails.
    """
    # NOTE(review): "条纹" literally means "stripes"; the replacement text
    # "stride" is kept unchanged so previously renamed files stay consistent.
    replacements = (
        ("条纹", "stride"),
        ("格子", "grid"),
        ("几何图形", "geometry"),
    )
    try:
        for old_filename in os.listdir(path):
            new_filename = old_filename
            if ' ' in old_filename:
                new_filename = '_'.join(old_filename.split())
            for keyword, replacement in replacements:
                new_filename = new_filename.replace(keyword, replacement)
            if new_filename != old_filename:
                print(new_filename)
                # Full paths are used so the rename does not depend on the
                # process working directory.
                os.rename(os.path.join(path, old_filename),
                          os.path.join(path, new_filename))
    finally:
        # Always leave the process in ``cwd``, even when a rename fails.
        os.chdir(cwd)

# Remember the working directory at this point.
current_path = os.getcwd()
# Uncomment the lines below to rename the files in both dataset folders
# rename_files(models_path, current_path)
# rename_files(patterns_path, current_path)

In [219]:
models_path = "D:\\datasets\\theme-classification\\models\\"
patterns_path = "D:\\datasets\\theme-classification\\patterns\\"

# os.listdir returns entries in arbitrary order, so sort the names: the
# seeded shuffle applied afterwards is only reproducible if it starts from a
# deterministic ordering. Each directory is listed exactly once.
models_filelist = sorted(os.listdir(models_path))
patterns_filelist = sorted(os.listdir(patterns_path))

print("model count: ", len(models_filelist))
print("pattern count: ", len(patterns_filelist))

def fullpath(filelist, folderpath):
    """Return the path of every name in ``filelist`` inside ``folderpath``.

    Args:
        filelist: Iterable of file names (not paths).
        folderpath: Directory containing the files; a trailing path
            separator is optional.

    Returns:
        A list of paths, in the same order as ``filelist``.
    """
    # os.path.join inserts the separator only when it is missing, so the
    # result is correct with or without a trailing separator on folderpath.
    return [os.path.join(folderpath, filename) for filename in filelist]

# Turn the bare file names into full paths.
models_filelist, patterns_filelist = (
    fullpath(models_filelist, models_path),
    fullpath(patterns_filelist, patterns_path),
)

# Number of files per class.
models_count, patterns_count = len(models_filelist), len(patterns_filelist)

# Shuffle each class list in place with a fixed seed
# (models first, then patterns).
random.seed(1)
random.shuffle(models_filelist)
random.shuffle(patterns_filelist)

# Generate labels
def generate_labels(filelist, label):
    """Pair every entry of ``filelist`` with ``label``.

    Returns a list of ``(filename, label)`` tuples in the input order.
    """
    labelled = []
    for filename in filelist:
        labelled.append((filename, label))
    return labelled

# Attach the class label to every file path.
models_filelist = generate_labels(models_filelist, "model")
patterns_filelist = generate_labels(patterns_filelist, "pattern")

# Construct training set and validation set: the first 90% of each class goes
# to training, the remainder to validation, so both sets keep the class ratio.
train_fraction = 0.9
models_split = int(models_count * train_fraction)
patterns_split = int(patterns_count * train_fraction)

training_filelist = models_filelist[:models_split] + patterns_filelist[:patterns_split]
validation_filelist = models_filelist[models_split:] + patterns_filelist[patterns_split:]
random.shuffle(training_filelist)
random.shuffle(validation_filelist)

# Passing the column names to the constructor also works for an empty split;
# assigning `.columns` afterwards raises a length mismatch in that case.
df_columns = ["filepath", "class"]
training_df = pd.DataFrame(training_filelist, columns=df_columns)
validation_df = pd.DataFrame(validation_filelist, columns=df_columns)

print("training set size: ", len(training_filelist))
print("validation set size: ", len(validation_filelist))

model count:  6336
pattern count:  2003
training set size:  7504
validation set size:  835


In [220]:
# Preview the first rows of the training table (columns: filepath, class).
training_df.head()

Unnamed: 0,filepath,class
0,D:\datasets\theme-classification\models\PARR-W...,model
1,D:\datasets\theme-classification\models\160897...,model
2,D:\datasets\theme-classification\patterns\1854...,pattern
3,D:\datasets\theme-classification\models\160878...,model
4,D:\datasets\theme-classification\models\160913...,model


## Data preprocessing

In [224]:
# Image geometry and batching parameters.
height, width, channels = 224, 224, 3
batch_size = 32
num_classes = 1  # unit count of the final Dense layer
class_names = ['model', 'pattern']

# Options shared by the training and validation iterators.
flow_options = dict(
    x_col='filepath',
    y_col='class',
    classes=class_names,
    target_size=(height, width),
    batch_size=batch_size,
    seed=7,
    class_mode='sparse',
    validate_filenames=False,
)

# Training images: ResNet-50 preprocessing plus random augmentation.
train_datagen = keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=keras.applications.resnet50.preprocess_input,
    rotation_range=40,
    width_shift_range=0.2,   # fraction of width
    height_shift_range=0.2,  # fraction of height
    shear_range=0.2,         # shear strength
    zoom_range=0.2,          # zoom strength
    horizontal_flip=True,
    fill_mode="wrap",
)
train_generator = train_datagen.flow_from_dataframe(
    training_df,
    shuffle=True,
    **flow_options,
)

# Validation images: ResNet-50 preprocessing only, in a fixed order.
valid_datagen = keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=keras.applications.resnet50.preprocess_input,
)
valid_generator = valid_datagen.flow_from_dataframe(
    validation_df,
    shuffle=False,
    **flow_options,
)

# Sample counts reported by the two iterators.
train_num, valid_num = train_generator.samples, valid_generator.samples
print(train_num, valid_num)

Found 7504 non-validated image filenames belonging to 2 classes.
Found 835 non-validated image filenames belonging to 2 classes.
7504 835


In [227]:
# Number of full batches in the training set (floor division).
train_num // batch_size

234

In [228]:
# Draw a batch from the training iterator and show its shapes and labels.
for i in range(1):
    x, y = next(train_generator)
    print(f"{x.shape} {y.shape}")
    print(y)

(32, 224, 224, 3) (32,)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0.
 0. 0. 0. 1. 0. 1. 0. 0.]


## Modeling with ResNet-50

In [229]:
# ResNet-50 with ImageNet weights, no classification top, and average pooling
# (the summary reports its output as (None, 2048)).
resnet50_base = keras.applications.ResNet50(include_top=False,
                                            weights='imagenet',
                                            pooling='avg')
resnet50_base.trainable = False  # freeze the ResNet; only the Dense head trains

# Stack the frozen base and a sigmoid Dense head with `num_classes` units.
resnet50_fine_tune = keras.models.Sequential([
    resnet50_base,
    keras.layers.Dense(num_classes, activation='sigmoid'),
])

resnet50_fine_tune.compile(optimizer="adam",
                           loss="binary_crossentropy",
                           metrics=["accuracy"])
resnet50_fine_tune.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
resnet50 (Functional)        (None, 2048)              23587712  
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 2049      
Total params: 23,589,761
Trainable params: 2,049
Non-trainable params: 23,587,712
_________________________________________________________________


In [230]:
epochs = 1  # 10 for fine tune!

# Use len(generator) for the step counts: it is the number of batches needed
# to cover every sample, including the last partial batch. Floor division
# (samples // batch_size) drops that partial batch, so part of the validation
# set would never be evaluated.
history = resnet50_fine_tune.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=epochs,
    validation_data=valid_generator,
    validation_steps=len(valid_generator),
)



In [233]:
# Shape of the batch currently held in `x`.
x.shape

(32, 224, 224, 3)

In [249]:
# Threshold the model outputs for batch `x` at 0.5, giving a list of 1.0 / 0.0.
y_pred = (resnet50_fine_tune.predict_on_batch(x) >= 0.5).astype(float).ravel().tolist()

In [250]:
# Element-wise comparison of the batch labels with the thresholded predictions.
y == y_pred

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])

In [286]:
# Draw the next batch from the validation iterator; this rebinds `x` and `y`.
x, y = next(valid_generator)

In [287]:
# Labels of the batch currently held in `y`.
y

array([1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

In [288]:
# Threshold the model outputs for batch `x` at 0.5, giving a list of 1.0 / 0.0.
y_pred = (resnet50_fine_tune.predict_on_batch(x) >= 0.5).astype(float).ravel().tolist()

In [289]:
# Element-wise comparison of the thresholded predictions with the batch labels.
y_pred == y

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])

In [292]:
# Create the output directory if needed; exist_ok avoids the separate
# existence check and does not fail when the directory is already there.
os.makedirs("./saved-models", exist_ok=True)

# Save the model together with its optimizer, using the 'tf' save format.
resnet50_fine_tune.save(filepath='./saved-models/theme-classi-resnet50.tf',
                        include_optimizer=True,
                        save_format='tf')

INFO:tensorflow:Assets written to: ./saved-models/theme-classi-resnet50.tf\assets
