#### Creating File DataFrame

- 데이터 출처
    - https://open.selectstar.ai/ko/?page_id=5976

In [None]:
import pandas as pd
import numpy as np
import os
import json
from natsort import natsorted
from pathlib import Path

In [None]:
# Root folder of the food image dataset, relative to the notebook's working directory.
BASE_PATH = '../data/food1'

# Path object for the dataset root; the cells below search it recursively.
dir_path = Path(BASE_PATH)

In [None]:
# File-name prefixes that are left out of the generic image list.
# 'cake' and 'pasta' images are gathered separately in later cells, where their
# labels are refined from the JSON annotations.
# NOTE(review): '._' files are presumably macOS metadata sidecars, and the other
# prefixes look like categories that were dropped on purpose -- confirm.
EXCLUDED_STEM_PREFIXES = (
    '._', 'BBQ', 'bingsu', 'cake', 'coffee_hot', 'coffee_ice', 'serial', 'soup', 'pasta',
)

# Every PNG below the dataset root, at any depth.
image_ps = list(dir_path.rglob("*.png"))
image_paths = [p for p in image_ps if not p.stem.startswith(EXCLUDED_STEM_PREFIXES)]

# The label is the first folder below the dataset root. Deriving it with pathlib
# (instead of splitting the string on '\\' and taking a fixed index) works on
# every OS and does not depend on how many components BASE_PATH has.
# An image placed directly in the root has no such folder and raises IndexError.
labels = [p.relative_to(dir_path).parent.parts[0] for p in image_paths]

# Sanity check: both counts must match.
print(len(image_paths), len(labels))

In [None]:
# Every PNG below the 'cake' and 'pasta' folders, at any depth.
cake_image_ps = list(dir_path.joinpath('cake').rglob("*.png"))
pasta_image_ps = list(dir_path.joinpath('pasta').rglob("*.png"))

# Drop files whose name starts with '._', then sort naturally by full path.
# The JSON annotations for these two categories are natsorted with key=str and
# are later paired with these images purely by position, so the images must
# follow the same ordering; rglob() alone gives no such guarantee.
cake_image_paths = natsorted(
    (p for p in cake_image_ps if not p.stem.startswith('._')), key=str
)
pasta_image_paths = natsorted(
    (p for p in pasta_image_ps if not p.stem.startswith('._')), key=str
)

print(len(cake_image_paths), len(pasta_image_paths))

In [None]:
# Every JSON annotation below the 'cake' and 'pasta' folders, at any depth.
# Files whose name starts with '._' are skipped, exactly as the image listings
# do: such a file would either fail to parse or shift the positional pairing
# between annotations and images.
cake_json_ps = [
    p for p in dir_path.joinpath('cake').rglob("*.json")
    if not p.stem.startswith('._')
]
pasta_json_ps = [
    p for p in dir_path.joinpath('pasta').rglob("*.json")
    if not p.stem.startswith('._')
]

# Natural sort on the full path string, so numeric parts order as numbers.
cake_json_paths = natsorted(cake_json_ps, key=str)
pasta_json_paths = natsorted(pasta_json_ps, key=str)

In [None]:
# Parse every annotation file into a Python object, keeping the sorted order.
# 'utf-8-sig' is used so a leading byte-order mark, if present, is not passed
# to the JSON parser.
cake_json_list = [
    json.loads(json_path.read_text(encoding='utf-8-sig'))
    for json_path in cake_json_paths
]

pasta_json_list = [
    json.loads(json_path.read_text(encoding='utf-8-sig'))
    for json_path in pasta_json_paths
]

In [None]:
# One label per cake annotation, in annotation order.
cake_labels = []

for annotation in cake_json_list:
    for item in annotation['ingredients']:
        if item['subtype'] != 'dairy_product_egg':
            continue
        # Only the first dairy/egg ingredient decides the label.
        # Butter and ice cream fall back to the generic 'cake' label; any
        # other ingredient becomes '<ingredient>_cake'.
        if item['ingredient'] == 'butter' or item['ingredient'] == 'ice_cream':
            cake_labels.append('cake')
        else:
            cake_labels.append(item['ingredient'] + '_cake')
        break
    else:
        # No dairy/egg ingredient at all: generic label.
        cake_labels.append('cake')

print(len(cake_labels))

In [None]:
# Sauce ingredient -> pasta label. Ketchup is counted as tomato sauce.
sauce_to_label = {
    'tomato_sauce': 'tomato_sauce_pasta',
    'cream_sauce': 'cream_sauce_pasta',
    'oil': 'oil_pasta',
    'ketchup': 'tomato_sauce_pasta',
}

# One label per pasta annotation, in annotation order.
pasta_labels = []

for annotation in pasta_json_list:
    for item in annotation['ingredients']:
        if item['subtype'] == 'soup_sauce_spice':
            # Only the first sauce/spice ingredient decides the label;
            # anything not in the table falls back to plain 'pasta'.
            pasta_labels.append(sauce_to_label.get(item['ingredient'], 'pasta'))
            break
    else:
        # No sauce/spice ingredient at all: generic label.
        pasta_labels.append('pasta')

print(len(pasta_labels))

In [None]:
# Merge the generic, cake and pasta lists. Paths and labels are concatenated
# in the same order because they are paired by position afterwards.
all_image_paths = [*image_paths, *cake_image_paths, *pasta_image_paths]
all_labels = [*labels, *cake_labels, *pasta_labels]

# Sanity check: both counts must match.
print(len(all_image_paths), len(all_labels))

In [None]:
# Column of image paths, converted from Path objects to plain strings.
image_series = pd.Series(all_image_paths).astype(str).rename('imagepath')
# Column of class labels, row-aligned with the paths.
labels_series = pd.Series(all_labels).rename('label')

# Two-column table: 'imagepath' and 'label'.
images_df = pd.concat((image_series, labels_series), axis='columns')

In [None]:
# Preview the first rows of the path/label table.
images_df.head()

In [None]:
# Number of images per label in the full table.
images_df['label'].value_counts()

In [None]:
# Upper bound on the number of images drawn from each label.
SAMPLES_PER_CATEGORY = 100

# Draw a fixed-seed random subset from every label.
# Sampling is done WITHOUT replacement: with replacement the same image can be
# drawn more than once, and after the train/test split a duplicate may sit in
# both sets, which inflates the test score. A label with fewer images than the
# bound contributes all of its images instead of being padded with duplicates.
category_samples = []
for category in images_df['label'].unique():
    category_slice = images_df.query("label == @category")
    n_samples = min(SAMPLES_PER_CATEGORY, len(category_slice))
    category_samples.append(
        category_slice.sample(n_samples, random_state=42, replace=False)
    )

# Stack the per-label subsets, shuffle all rows, and renumber the index.
sample_df = (
    pd.concat(category_samples, axis=0)
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the sampled table.
sample_df.head()

In [None]:
# Number of sampled images per label.
sample_df['label'].value_counts()

#### Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the sampled rows as the test set; the fixed seed keeps the
# split reproducible.
sample_train_df, sample_test_df = train_test_split(
    sample_df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

#### Creating Generators

In [None]:
import tensorflow as tf

In [None]:
# Short aliases for the long Keras names used by both generators.
_ImageDataGenerator = tf.keras.preprocessing.image.ImageDataGenerator
_mobilenet_preprocess = tf.keras.applications.mobilenet_v2.preprocess_input

# Generator for the training frame. validation_split=0.2 is set so the same
# generator can serve the 'training' and 'validation' subsets requested below.
train_generator = _ImageDataGenerator(
    preprocessing_function=_mobilenet_preprocess,
    validation_split=0.2,
)

# Generator for the test frame: same preprocessing, no validation split.
test_generator = _ImageDataGenerator(
    preprocessing_function=_mobilenet_preprocess,
)

In [None]:
# Keyword arguments shared by all three iterators: which columns hold the file
# path and the label, the image size and colour mode, one-hot style labels
# ('categorical'), and the batch size.
flow_options = dict(
    x_col='imagepath',
    y_col='label',
    target_size=(224, 224),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=32,
)

# Training subset of the training frame, shuffled with a fixed seed.
sample_train_images = train_generator.flow_from_dataframe(
    dataframe=sample_train_df,
    shuffle=True,
    seed=42,
    subset='training',
    **flow_options,
)

# Validation subset of the same training frame.
sample_val_images = train_generator.flow_from_dataframe(
    dataframe=sample_train_df,
    shuffle=True,
    seed=42,
    subset='validation',
    **flow_options,
)

# Test frame, kept in its original row order (shuffle=False).
sample_test_images = test_generator.flow_from_dataframe(
    dataframe=sample_test_df,
    shuffle=False,
    **flow_options,
)

In [None]:
# Show the training iterator object as the cell output.
sample_train_images

#### Modeling