#### Creating File DataFrame

- 데이터 출처
    - https://open.selectstar.ai/ko/?page_id=5976
    - https://aihub.or.kr/aihubdata/data/view.do?currMenu=&topMenu=&aihubDataSe=realm&dataSetSn=242

In [1]:
import pandas as pd
import numpy as np
import os
import json
from natsort import natsorted
from pathlib import Path

In [2]:
BASE_PATH = '../data/food'

dir_path = Path(BASE_PATH)

In [3]:
image_ps = list(dir_path.rglob("*.png"))
image_paths = []

for i in image_ps:
    if i.stem.startswith(('._', 'BBQ', 'bingsu', 'cake', 'coffee_hot', 'coffee_ice', 'gelato', 'serial', 'soup', 'pasta')) == True:
        pass
    else:
        image_paths.append(i)
        
labels = list(map(lambda x: os.path.split(x)[0].split('\\')[3], image_paths))

print(len(image_paths), len(labels))

85747 85747


In [4]:
cake_image_ps = list(dir_path.joinpath('cake').rglob("*.png"))
pasta_image_ps = list(dir_path.joinpath('pasta').rglob("*.png"))
cake_image_paths = []
pasta_image_paths = []

for i in cake_image_ps:
    if i.stem.startswith('._') == True:
        pass
    else:
        cake_image_paths.append(i)

for i in pasta_image_ps:
    if i.stem.startswith('._') == True:
        pass
    else:
        pasta_image_paths.append(i)

print(len(cake_image_paths), len(pasta_image_paths))

1000 1004


In [5]:
cake_json_ps = list(dir_path.joinpath('cake').rglob("*.json"))
pasta_json_ps = list(dir_path.joinpath('pasta').rglob("*.json"))

cake_json_paths = natsorted(cake_json_ps, key=str)
pasta_json_paths = natsorted(pasta_json_ps, key=str)

In [6]:
cake_json_list = []
pasta_json_list = []

for i in cake_json_paths:
    with i.open(encoding='utf-8-sig') as f:
        cake_data = json.load(f)
        cake_json_list.append(cake_data)

for i in pasta_json_paths:
    with i.open(encoding='utf-8-sig') as f:
        pasta_data = json.load(f)
        pasta_json_list.append(pasta_data)

In [7]:
cake_labels = []

for i in range(len(cake_json_list)):
    for j in range(len(cake_json_list[i]['ingredients'])):
        if cake_json_list[i]['ingredients'][j]['subtype'] == 'dairy_product_egg':
            if cake_json_list[i]['ingredients'][j]['ingredient'] == 'butter':
                cake_labels.append('cake')
            elif cake_json_list[i]['ingredients'][j]['ingredient'] == 'ice_cream':
                cake_labels.append('cake')
            else:
                cake_name = cake_json_list[i]['ingredients'][j]['ingredient'] + '_cake'
                cake_labels.append(cake_name)
            break
        else:
            pass
        
    if len(cake_labels) != i+1:
        cake_labels.append('cake')
    else:
        pass
        
print(len(cake_labels))

1000


In [8]:
pasta_labels = []

for i in range(len(pasta_json_list)):
    for j in range(len(pasta_json_list[i]['ingredients'])):
        if pasta_json_list[i]['ingredients'][j]['subtype'] == 'soup_sauce_spice':
            if pasta_json_list[i]['ingredients'][j]['ingredient'] == 'tomato_sauce':
                pasta_labels.append('tomato_sauce_pasta')
            elif pasta_json_list[i]['ingredients'][j]['ingredient'] == 'cream_sauce':
                pasta_labels.append('cream_sauce_pasta')
            elif pasta_json_list[i]['ingredients'][j]['ingredient'] == 'oil':
                pasta_labels.append('oil_pasta')
            elif pasta_json_list[i]['ingredients'][j]['ingredient'] == 'ketchup':
                pasta_labels.append('tomato_sauce_pasta')
            else:
                pasta_labels.append('pasta')
            break
        else:
            pass
        
    if len(pasta_labels) != i+1:
        pasta_labels.append('pasta')
    else:
        pass
        
print(len(pasta_labels))

1004


In [9]:
all_image_paths = image_paths + cake_image_paths + pasta_image_paths
all_labels = labels + cake_labels + pasta_labels

print(len(all_image_paths), len(all_labels))

87751 87751


In [10]:
image_series = pd.Series(all_image_paths, name='imagepath').astype(str)
labels_series = pd.Series(all_labels, name='label')

images_df = pd.concat([image_series, labels_series], axis=1)

In [11]:
images_df.head()

Unnamed: 0,imagepath,label
0,..\data\food\baek_sook\png\baek_sook_0001.png,baek_sook
1,..\data\food\baek_sook\png\baek_sook_0002.png,baek_sook
2,..\data\food\baek_sook\png\baek_sook_0003.png,baek_sook
3,..\data\food\baek_sook\png\baek_sook_0004.png,baek_sook
4,..\data\food\baek_sook\png\baek_sook_0005.png,baek_sook


In [12]:
images_df['label'].value_counts()

label
ratatouille           1202
dim_sum               1162
chocolate             1111
takoyaki              1100
pork_chop             1098
                      ... 
tomato_sauce_pasta     240
chocolate_cake         228
oil_pasta              161
cheese_cake             41
cake                    35
Name: count, Length: 92, dtype: int64

In [24]:
category_samples = []
for category in images_df['label'].unique():
    category_slice = images_df.query("label == @category")
    category_samples.append(category_slice.sample(100, random_state=42, replace=True))
sample_df = pd.concat(category_samples, axis=0).sample(frac=1.0, random_state=42).reset_index(drop=True)

In [25]:
sample_df.head()

Unnamed: 0,imagepath,label
0,..\data\food\risotto\png\risotto_0072.png,risotto
1,..\data\food\lobster\png\lobster_1014.png,lobster
2,..\data\food\egg_tart\png\egg_tart_0021.png,egg_tart
3,..\data\food\muffin\png\muffin_0103.png,muffin
4,..\data\food\muffin\png\muffin_0662.png,muffin


In [26]:
sample_df['label'].value_counts()

label
risotto        100
dacquoise      100
wolnam_ssam    100
cookie         100
kimchi_stew    100
              ... 
pound_cake     100
macaroon       100
burger         100
salad          100
nachos         100
Name: count, Length: 92, dtype: int64

#### Train-Test Split

In [22]:
from sklearn.model_selection import train_test_split

In [27]:
sample_train_df, sample_test_df = train_test_split(sample_df, test_size=0.2, shuffle=True, random_state=42)

#### Creating Generators

In [29]:
import tensorflow as tf

In [30]:
train_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=tf.keras.applications.mobilenet_v2.preprocess_input,
    validation_split=0.2
)

test_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=tf.keras.applications.mobilenet_v2.preprocess_input
)

In [31]:
sample_train_images = train_generator.flow_from_dataframe(
    dataframe=sample_train_df,
    x_col='imagepath',
    y_col='label',
    target_size=(224, 224),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=32,
    shuffle=True,
    seed=42,
    subset='training'
)

sample_val_images = train_generator.flow_from_dataframe(
    dataframe=sample_train_df,
    x_col='imagepath',
    y_col='label',
    target_size=(224, 224),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=32,
    shuffle=True,
    seed=42,
    subset='validation'
)

sample_test_images = test_generator.flow_from_dataframe(
    dataframe=sample_test_df,
    x_col='imagepath',
    y_col='label',
    target_size=(224, 224),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=32,
    shuffle=False
)

Found 5888 validated image filenames belonging to 92 classes.
Found 1472 validated image filenames belonging to 92 classes.
Found 1840 validated image filenames belonging to 92 classes.


#### Modeling