#### Creating File DataFrame

- 데이터 출처
    - https://open.selectstar.ai/ko/?page_id=5976
    - https://aihub.or.kr/aihubdata/data/view.do?currMenu=&topMenu=&aihubDataSe=realm&dataSetSn=242
    - https://aihub.or.kr/aihubdata/data/view.do?currMenu=&topMenu=&aihubDataSe=realm&dataSetSn=79

In [1]:
import pandas as pd
import numpy as np
import os
import json
import random
from natsort import natsorted
from pathlib import Path

In [2]:
# Root folder of the image datasets (food1 / food2 / food3 are globbed beneath it
# in the cells below). The path is relative, so it resolves against the kernel's
# current working directory.
BASE_PATH = '../data'

# pathlib handle on BASE_PATH; every later glob is built from this object.
dir_path = Path(BASE_PATH)

In [3]:
# File-name prefixes that are dropped from food1.
# NOTE(review): '._' presumably matches macOS sidecar files and the remaining
# prefixes look like classes that are deliberately left out — confirm with the
# dataset owner. (The original tuple listed 'chicken' twice; once is enough.)
FOOD1_EXCLUDED_PREFIXES = (
    '._', 'BBQ', 'bingsu', 'cake', 'chicken', 'coffee_hot', 'coffee_ice',
    'galbi', 'serial', 'soup', 'pasta',
)

# Sort the glob result so row order (and therefore every seeded sample taken
# from the DataFrame later) does not depend on file-system listing order.
food1_ps = sorted(dir_path.joinpath('food1').rglob("*.png"))

# Keep every PNG whose file name does not start with an excluded prefix.
food1_paths = [p for p in food1_ps if not p.stem.startswith(FOOD1_EXCLUDED_PREFIXES)]

# The class name is the first folder below 'food1'. Taking it from the path
# relative to dir_path works with any path separator and any BASE_PATH depth,
# unlike splitting on '\\' and indexing a fixed position. A file that sits
# directly in 'food1' (no class folder) still raises IndexError, as before.
food1_labels = [p.parent.relative_to(dir_path).parts[1] for p in food1_paths]

print(len(food1_paths), len(food1_labels))

84748 84748


In [4]:
# Collect the name of every directory found under food2.
# os.walk already yields bare directory names, so no path splitting is needed;
# the previous join + split('\\')[-1] only produced the folder name on Windows
# and returned the whole path on POSIX systems.
# NOTE(review): os.walk recurses, so nested sub-folders (if any exist) are
# listed as categories too — same as before.
food2_cate = [
    d
    for _root, directories, _files in os.walk(dir_path.joinpath('food2'))
    for d in directories
]

len(food2_cate)

130

In [5]:
# Upper bound on the number of images kept per food2 category.
FOOD2_MAX_PER_CATEGORY = 1300

# Dedicated, seeded RNG: the subsample is identical on every
# Restart Kernel -> Run All instead of changing with each run.
food2_rng = random.Random(42)

food2_paths = []

# Categories and files are visited in sorted order so the seeded RNG sees the
# same inputs regardless of how the file system lists directories.
for category in sorted(food2_cate):
    # Glob once per category (the original repeated the same rglob up to three times).
    category_images = sorted(dir_path.joinpath(f'food2/{category}').rglob("*.jpg"))
    if len(category_images) > FOOD2_MAX_PER_CATEGORY:
        # Over the cap: keep a random subset of exactly FOOD2_MAX_PER_CATEGORY files.
        category_images = food2_rng.sample(category_images, FOOD2_MAX_PER_CATEGORY)
    food2_paths.extend(category_images)

len(food2_paths)

143574

In [6]:
# The class name is the first folder below 'food2'. It is read from the path
# relative to dir_path, so it works with any path separator and any BASE_PATH
# depth (the previous split('\\')[3] only worked on Windows with '../data').
food2_labels = [p.parent.relative_to(dir_path).parts[1] for p in food2_paths]

len(food2_labels)

143574

In [7]:
# List the food3 images by extension. Only the JPG list is used when the
# datasets are combined further down; the PNG list is just counted here.
food3_root = dir_path / 'food3'

food3_paths = [*food3_root.rglob("*.jpg")]
food3_png = [*food3_root.rglob("*.png")]

print(len(food3_paths), len(food3_png))

40261 39


In [8]:
# The class name is the first folder below 'food3'. It is read from the path
# relative to dir_path, so it works with any path separator and any BASE_PATH
# depth (the previous split('\\')[3] only worked on Windows with '../data').
food3_labels = [p.parent.relative_to(dir_path).parts[1] for p in food3_paths]

len(food3_labels)

40261

In [9]:
# Merge the three datasets, keeping paths and labels in the same order
# (food1, then food2, then food3) so index i of one list matches index i of the other.
all_image_paths = [*food1_paths, *food2_paths, *food3_paths]
all_labels = [*food1_labels, *food2_labels, *food3_labels]

# The two counts printed here should be equal.
print(len(all_image_paths), len(all_labels))

268583 268583


In [10]:
# Paths are stored as plain strings (not Path objects) in the 'imagepath' column.
image_series = pd.Series(all_image_paths, name='imagepath').astype(str)
labels_series = pd.Series(all_labels, name='label')

# One column per series, aligned on the shared positional index.
images_df = pd.DataFrame(
    {column.name: column for column in (image_series, labels_series)}
)

In [11]:
# Preview the first rows of the combined (imagepath, label) table.
images_df.head()

Unnamed: 0,imagepath,label
0,..\data\food1\baek_sook\png\baek_sook_0001.png,baek_sook
1,..\data\food1\baek_sook\png\baek_sook_0002.png,baek_sook
2,..\data\food1\baek_sook\png\baek_sook_0003.png,baek_sook
3,..\data\food1\baek_sook\png\baek_sook_0004.png,baek_sook
4,..\data\food1\baek_sook\png\baek_sook_0005.png,baek_sook


In [12]:
# Number of images per class, most frequent first.
images_df['label'].value_counts()

label
caesar_salad            1300
toast                   1300
dumpling                1300
fish_cutlet             1300
fried_chicken           1300
                        ... 
chilled_jokbal_salad     312
sundaegukbap             305
mushroom                 248
fried_food               223
tteokgalbi               164
Name: count, Length: 253, dtype: int64

In [13]:
# Draw 100 rows from every class with a fixed seed; .sample raises if a class
# has fewer than 100 rows.
category_samples = [
    images_df[images_df['label'] == category].sample(100, random_state=42)
    for category in images_df['label'].unique()
]

# Stack the per-class samples, shuffle all rows with a fixed seed, and renumber the index.
sample_df = (
    pd.concat(category_samples, axis=0)
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

In [14]:
# Preview the first rows of the shuffled, class-balanced sample.
sample_df.head()

Unnamed: 0,imagepath,label
0,..\data\food2\walnut_pie\A020525XX_00141.jpg,walnut_pie
1,..\data\food1\chocolate\png\chocolate_0617.png,chocolate
2,..\data\food1\tortilla\png\tortilla_0287.png,tortilla
3,..\data\food1\croissant\png\croissant_0817.png,croissant
4,..\data\food1\lasana\png\lasagna_0957.png,lasana


In [15]:
# Sanity check: every class should appear exactly 100 times after sampling.
sample_df['label'].value_counts()

label
walnut_pie              100
potato_pancake          100
chicken_nuggets         100
chicken_breast_salad    100
sea_cucumber            100
                       ... 
egg_benedict            100
dubujorim               100
dakbal                  100
tomato_salad            100
madeleine               100
Name: count, Length: 253, dtype: int64

#### Train-Test Split

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 20% of the sample for testing. Stratifying on the label keeps the
# class balance of sample_df in both parts; a plain random split would give each
# class a different share of the test set.
sample_train_df, sample_test_df = train_test_split(
    sample_df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=sample_df['label'],
)

#### Creating Generators

In [18]:
import tensorflow as tf

In [19]:
# Both generators run images through the MobileNetV2 input preprocessing function.
mobilenet_preprocess = tf.keras.applications.mobilenet_v2.preprocess_input

# validation_split reserves 20% of the rows fed to this generator; the
# 'training' / 'validation' subsets are selected when the flows are created.
train_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    validation_split=0.2,
    preprocessing_function=mobilenet_preprocess,
)

# The test generator has no validation split: it serves the whole DataFrame it is given.
test_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=mobilenet_preprocess,
)

In [20]:
# Keyword arguments shared by all three flows: which columns to read and how
# images are resized, colour-decoded, label-encoded and batched.
flow_options = dict(
    x_col='imagepath',
    y_col='label',
    target_size=(224, 224),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=32,
)

# Training portion of sample_train_df (the part outside validation_split).
sample_train_images = train_generator.flow_from_dataframe(
    dataframe=sample_train_df,
    shuffle=True,
    seed=42,
    subset='training',
    **flow_options,
)

# Validation portion of the same DataFrame, served by the same generator.
sample_val_images = train_generator.flow_from_dataframe(
    dataframe=sample_train_df,
    shuffle=True,
    seed=42,
    subset='validation',
    **flow_options,
)

# Held-out test rows; shuffle is off so batches follow the DataFrame's row order.
sample_test_images = test_generator.flow_from_dataframe(
    dataframe=sample_test_df,
    shuffle=False,
    **flow_options,
)

Found 16192 validated image filenames belonging to 253 classes.
Found 4048 validated image filenames belonging to 253 classes.
Found 5060 validated image filenames belonging to 253 classes.


#### Modeling