#### Creating File DataFrame

- 데이터 출처
    - https://open.selectstar.ai/ko/?page_id=5976
    - https://aihub.or.kr/aihubdata/data/view.do?currMenu=&topMenu=&aihubDataSe=realm&dataSetSn=242
    - https://aihub.or.kr/aihubdata/data/view.do?currMenu=&topMenu=&aihubDataSe=realm&dataSetSn=79

In [66]:
import pandas as pd
import numpy as np
import os
import json
import random
from natsort import natsorted
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
BASE_PATH = '../data'

dir_path = Path(BASE_PATH)

In [3]:
food1_ps = list(dir_path.joinpath('food1').rglob("*.png"))
food1_paths = []

for i in food1_ps:
    if i.stem.startswith(('._', 'BBQ', 'bingsu', 'cake', 'chicken', 'coffee_hot', 'coffee_ice', 'galbi', 'serial', 'soup', 'pasta', 'chicken')) == True:
        pass
    else:
        food1_paths.append(i)
        
food1_labels = list(map(lambda x: os.path.split(x)[0].split('\\')[3], food1_paths))

print(len(food1_paths), len(food1_labels))

84748 84748


In [4]:
food2_cate = []

for (root, directories, files) in os.walk(dir_path.joinpath('food2')):
    for d in directories:
        category = os.path.join(root, d).split('\\')[-1]
        food2_cate.append(category)

len(food2_cate)

130

In [5]:
food2_paths = []

for category in food2_cate:
    if len(list(dir_path.joinpath(f'food2/{category}').rglob("*.jpg"))) > 1300:
        random_elements = random.sample(list(dir_path.joinpath(f'food2/{category}').rglob("*.jpg")), 1300)
        food2_paths.extend(random_elements)
    else :
        small = list(dir_path.joinpath(f'food2/{category}').rglob("*.jpg"))
        food2_paths.extend(small)

food2_labels = list(map(lambda x: os.path.split(x)[0].split('\\')[3], food2_paths))

print(len(food2_paths), len(food2_labels))

143574 143574


In [6]:
food3_paths = list(dir_path.joinpath('food3').rglob("*.jpg"))

food3_png = list(dir_path.joinpath('food3').rglob("*.png"))

print(len(food3_paths), len(food3_png))

40261 39


In [7]:
food3_labels = list(map(lambda x: os.path.split(x)[0].split('\\')[3], food3_paths))

len(food3_labels)

40261

In [8]:
all_image_paths = food1_paths + food2_paths + food3_paths
all_labels = food1_labels + food2_labels + food3_labels

print(len(all_image_paths), len(all_labels))

268583 268583


In [48]:
image_series = pd.Series(all_image_paths, name='imagepath').astype(str)
labels_series = pd.Series(all_labels, name='label')

images_df = pd.concat([image_series, labels_series], axis=1)

In [49]:
images_df.head()

Unnamed: 0,imagepath,label
0,..\data\food1\baek_sook\png\baek_sook_0001.png,baek_sook
1,..\data\food1\baek_sook\png\baek_sook_0002.png,baek_sook
2,..\data\food1\baek_sook\png\baek_sook_0003.png,baek_sook
3,..\data\food1\baek_sook\png\baek_sook_0004.png,baek_sook
4,..\data\food1\baek_sook\png\baek_sook_0005.png,baek_sook


In [50]:
images_df['label'].value_counts()

label
caesar_salad            1300
toast                   1300
dumpling                1300
fish_cutlet             1300
fried_chicken           1300
                        ... 
chilled_jokbal_salad     312
sundaegukbap             305
mushroom                 248
fried_food               223
tteokgalbi               164
Name: count, Length: 253, dtype: int64

In [37]:
# category = list(images_df['label'].unique())

# labels_id = {string:idx for idx, string in enumerate (category)}

# for category in category:
#     images_df['label'] = images_df['label'].replace(category, labels_id[category])

# images_df['label']


{'baek_sook': 0,
 'baguette': 1,
 'banh_mi': 2,
 'beef_tartare': 3,
 'bibimbap': 4,
 'bulgogi': 5,
 'bunza': 6,
 'burger': 7,
 'burrito': 8,
 'cannoli': 9,
 'caprese': 10,
 'chili_crab': 11,
 'chocolate': 12,
 'churros': 13,
 'cookie': 14,
 'crepe': 15,
 'croissant': 16,
 'croque_monsieur': 17,
 'curry': 18,
 'dacquoise': 19,
 'dim_sum': 20,
 'donut': 21,
 'egg_benedict': 22,
 'egg_tart': 23,
 'escargot': 24,
 'fish_and_chips': 25,
 'fondue': 26,
 'french_fries': 27,
 'french_toast': 28,
 'gelato': 29,
 'gimbap': 30,
 'gratin': 31,
 'hot_dog': 32,
 'hot_pot': 33,
 'jajangmyeon': 34,
 'japchae': 35,
 'kaya_toast': 36,
 'kebap': 37,
 'kimchi_stew': 38,
 'korean_pancake': 39,
 'lasana': 40,
 'lobster': 41,
 'macaroon': 42,
 'madeleine': 43,
 'mapa_tofu': 44,
 'milfeuille': 45,
 'muffin': 46,
 'naan': 47,
 'nachos': 48,
 'nasi_goreng': 49,
 'omelet': 50,
 'onigiri': 51,
 'pad_thai': 52,
 'paea': 53,
 'pan_cake': 54,
 'pie': 55,
 'pizza': 56,
 'popcorn': 57,
 'pork_chop': 58,
 'pound_cake':

#### Setting

In [74]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, BatchNormalization, ReLU, Dropout, GlobalAveragePooling2D
from tensorflow.keras.models import Sequential

import tensorflow as tf

from keras.models import load_model

In [13]:
np.random.seed(42)
tf.random.set_seed(42)

In [14]:
# GPU 사용여부 확인하기
from tensorflow.python.client import device_lib

print(device_lib.list_local_devices()) # print avaliable CPU, GPU list
print("───────────────────────────────────────")
print("am I using GPU?: ", bool(tf.config.list_physical_devices('GPU'))) # NVIDIA Quadro RTX 5000

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 8038629424490720911
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 13059315712
locality {
  bus_id: 1
  links {
  }
}
incarnation: 7516719456982811015
physical_device_desc: "device: 0, name: Quadro RTX 5000, pci bus id: 0000:03:00.0, compute capability: 7.5"
xla_global_id: 416903419
, name: "/device:GPU:1"
device_type: "GPU"
memory_limit: 13059315712
locality {
  bus_id: 1
  links {
  }
}
incarnation: 499915547328934630
physical_device_desc: "device: 1, name: Quadro RTX 5000, pci bus id: 0000:81:00.0, compute capability: 7.5"
xla_global_id: 2144165316
]
───────────────────────────────────────
am I using GPU?:  True


#### Sample Data Test

In [52]:
category_samples = []
for category in images_df['label'].unique():
    category_slice = images_df.query("label == @category")
    category_samples.append(category_slice.sample(100, random_state=42))
sample_df = pd.concat(category_samples, axis=0).sample(frac=1.0, random_state=42).reset_index(drop=True)

In [53]:
sample_df.head()

Unnamed: 0,imagepath,label
0,..\data\food2\walnut_pie\A020525XX_30208.jpg,walnut_pie
1,..\data\food1\chocolate\png\chocolate_0617.png,chocolate
2,..\data\food1\tortilla\png\tortilla_0287.png,tortilla
3,..\data\food1\croissant\png\croissant_0817.png,croissant
4,..\data\food1\lasana\png\lasagna_0957.png,lasana


In [54]:
sample_df['label'].value_counts()

label
walnut_pie              100
potato_pancake          100
chicken_nuggets         100
chicken_breast_salad    100
sea_cucumber            100
                       ... 
egg_benedict            100
dubujorim               100
dakbal                  100
tomato_salad            100
madeleine               100
Name: count, Length: 253, dtype: int64

##### Train-Test Split

In [55]:
sample_train_df, sample_test_df = train_test_split(sample_df, test_size=0.2, shuffle=True, random_state=42)

##### Creating Generators

In [51]:
train_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=tf.keras.applications.vgg16.preprocess_input,
    validation_split=0.2
)

test_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=tf.keras.applications.vgg16.preprocess_input
)

In [56]:
sample_train_images = train_generator.flow_from_dataframe(
    dataframe=sample_train_df,
    x_col='imagepath',
    y_col='label',
    target_size=(224, 224),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=32,
    shuffle=True,
    seed=42,
    subset='training'
)

sample_val_images = train_generator.flow_from_dataframe(
    dataframe=sample_train_df,
    x_col='imagepath',
    y_col='label',
    target_size=(224, 224),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=32,
    shuffle=True,
    seed=42,
    subset='validation'
)

sample_test_images = test_generator.flow_from_dataframe(
    dataframe=sample_test_df,
    x_col='imagepath',
    y_col='label',
    target_size=(224, 224),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=32,
    shuffle=False
)

Found 16192 validated image filenames belonging to 253 classes.
Found 4048 validated image filenames belonging to 253 classes.
Found 5060 validated image filenames belonging to 253 classes.


##### Modeling

In [21]:
pretrained_model = VGG16(
    input_shape=(224, 224, 3),
    include_top=False,
    weights='imagenet'
)

pretrained_model.trainable = False

In [22]:
inputs = pretrained_model.input

x = Dense(128, activation='relu')(pretrained_model.output)
x = GlobalAveragePooling2D()(x)
x = Dense(128, activation='relu')(x)

outputs = Dense(253, activation='softmax')(x)

sample_model = tf.keras.Model(inputs, outputs)


print(sample_model.summary())

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

##### Training

In [57]:
sample_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

sample_history = sample_model.fit(
    sample_train_images,
    batch_size=128,
    validation_data=sample_val_images,
    epochs=5,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        )
    ]
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


##### Results

In [60]:
sample_results = sample_model.evaluate(sample_test_images, verbose=0)
print("Sample Test Accuracy: {:.2f}%".format(sample_results[1] * 100))

Sample Test Accuracy: 43.44%


In [63]:
sample_predictions = np.argmax(sample_model.predict(sample_test_images), axis=1)

sample_cm = confusion_matrix(sample_test_images.labels, sample_predictions)
sample_clr = classification_report(sample_test_images.labels, sample_predictions, target_names=sample_test_images.class_indices, zero_division=0)

In [68]:
print("Classification Report:\n----------------------\n", sample_clr)

Classification Report:
----------------------
                           precision    recall  f1-score   support

                 abalone       0.88      0.78      0.82        27
               agwi_jjim       0.43      0.73      0.54        22
                  almond       0.71      0.91      0.80        22
                   bacon       0.62      0.33      0.43        24
               baek_sook       0.67      0.50      0.57        20
                baguette       0.20      0.24      0.22        21
                 banh_mi       0.50      0.43      0.47        23
       bean_sprout_salad       0.75      0.67      0.71        18
        bean_sprout_soup       0.53      0.42      0.47        19
              beef_jerky       0.41      0.75      0.53        20
            beef_tartare       0.45      0.39      0.42        23
                bibimbap       0.56      0.38      0.45        24
                  bossam       0.56      0.24      0.33        21
             brazil_nuts    

##### Save Model

In [76]:
sample_model.save('../models/sample_vgg16.h5')