# Preprocessing I

In [3]:
import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt
import sys

In [6]:
# Open the HDF5 dataset in read-only mode. The handle is intentionally left
# open because the cells below read datasets directly from it.
f = h5py.File(
    '/Users/Santiago/Desktop/data/dataset/food_c101_n10099_r64x64x3.h5',
    mode='r',
)

In [8]:
# List the names of the top-level members stored in the HDF5 file.
f.keys()

<KeysViewHDF5 ['category', 'category_names', 'images']>

In [9]:
# Show the shape and dtype of the 'category' dataset (boolean, one row per
# image; presumably one-hot class labels — confirm).
f['category']

<HDF5 dataset "category": shape (10099, 101), type "|b1">

In [11]:
# Show the shape and dtype of the 'category_names' dataset (fixed-width byte
# strings, one entry per category).
f['category_names']

<HDF5 dataset "category_names": shape (101,), type "|S40">

In [12]:
# Show the shape and dtype of the 'images' dataset (uint8 pixel arrays).
f['images']

<HDF5 dataset "images": shape (10099, 64, 64, 3), type "|u1">

In [13]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Augmentation settings, collected in one place so they are easy to tune.
augmentation_options = {
    'rescale': 1./255,               # multiply every pixel value by 1/255
    'rotation_range': 45,            # random rotation range, in degrees
    'zoom_range': 0.3,               # random zoom range
    'brightness_range': [0.7, 1.3],  # random brightness shift range
}
datagenerator = ImageDataGenerator(**augmentation_options)

In [14]:
# Feed the HDF5 images and their labels through the augmenting generator,
# producing one (image, label) pair per batch.
train_generator = datagenerator.flow(
    x=f['images'],
    y=f['category'],
    batch_size=1,
)

In [15]:
# Number of batches per pass over the data; since the iterator was built with
# batch_size = 1 this is also the number of images.
len(train_generator)

10099

In [16]:
# Shape of the image array in the first batch: the first [0] selects the
# batch, the second [0] selects its images (the labels sit at index 1).
train_generator[0][0].shape

(1, 64, 64, 3)

# Preprocessing II

In [31]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from pathlib import Path
import os   

import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report

In [34]:
# Root folder that is searched recursively for .jpg files below; each image's
# parent folder name is used as its label.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
image_files = Path('/Users/Santiago/Desktop/data/dataset/images') 

In [35]:
# Collect every .jpg under the image root; each file's label is the name of
# the folder that directly contains it.
images = list(image_files.glob(r'**/*.jpg'))
label = [image_path.parent.name for image_path in images]

# One row per image (path + label, both as strings), shuffled reproducibly.
imagedata = (
    pd.DataFrame({'Files': images, "label": label})
    .astype(str)
    .sample(frac=1.0, random_state=1)
    .reset_index(drop=True)
)

# Balance the classes: draw exactly 300 images from every category, using the
# same seed for each draw.
category_samples = [
    imagedata[imagedata['label'] == category].sample(300, random_state=1)
    for category in imagedata['label'].unique()
]

# Stack the per-category samples and shuffle once more so the classes are
# interleaved.
image_df = (
    pd.concat(category_samples, axis=0)
    .sample(frac=1.0, random_state=1)
    .reset_index(drop=True)
)

In [36]:
# Sanity check: every category should contribute the same number of images
# after the balanced sampling above.
image_df['label'].value_counts() 

nachos                 300
spaghetti_bolognese    300
caprese_salad          300
takoyaki               300
shrimp_and_grits       300
                      ... 
ceviche                300
pancakes               300
poutine                300
bruschetta             300
cup_cakes              300
Name: label, Length: 101, dtype: int64

In [37]:
# Split the balanced sample 70/30 into train and test frames; the fixed seed
# keeps the split reproducible.
traindata, testdata = train_test_split(
    image_df,
    train_size=0.7,
    random_state=1,
)

In [38]:
# Generator for the training frame: images go through MobileNetV2's own
# preprocessing function, and 20% of the rows are set aside so the same
# generator can serve a 'validation' subset.
mobilenet_preprocess = tf.keras.applications.mobilenet_v2.preprocess_input
train_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=mobilenet_preprocess,
    validation_split=0.2,
)

In [39]:
# Generator for the held-out test frame: same MobileNetV2 preprocessing as
# the training generator, but without a validation split.
mobilenet_preprocess = tf.keras.applications.mobilenet_v2.preprocess_input
test_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=mobilenet_preprocess,
)

In [40]:
# Training batches: the 'training' subset of traindata, read from the file
# paths in 'Files' with one-hot ('categorical') labels taken from 'label'.
# The resize argument must be spelled `target_size`; an unrecognised keyword
# such as `target=` is swallowed by **kwargs and silently ignored, which
# leaves images at the 256x256 default instead of the intended 224x224.
train_images = train_generator.flow_from_dataframe(
    dataframe=traindata,
    x_col='Files',
    y_col='label',
    target_size=(224, 224),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=32,
    seed=42,
    shuffle=True,
    subset='training',
)

Found 16968 validated image filenames belonging to 101 classes.


In [41]:
# Validation batches: the 'validation' subset of traindata, produced by the
# same generator (and therefore the same preprocessing) as the training set.
# The resize argument must be spelled `target_size`; an unrecognised keyword
# such as `target=` is swallowed by **kwargs and silently ignored, which
# leaves images at the 256x256 default instead of the intended 224x224.
val_images = train_generator.flow_from_dataframe(
    dataframe=traindata,
    x_col='Files',
    y_col='label',
    target_size=(224, 224),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=32,
    seed=42,
    shuffle=True,
    subset='validation',
)

Found 4242 validated image filenames belonging to 101 classes.


In [None]:
# Test batches: every row of testdata, in fixed order (shuffle=False) so that
# predictions can be lined up with the true labels afterwards.
# The resize argument must be spelled `target_size`; an unrecognised keyword
# such as `target=` is swallowed by **kwargs and silently ignored, which
# leaves images at the 256x256 default instead of the intended 224x224.
test_images = test_generator.flow_from_dataframe(
    dataframe=testdata,
    x_col='Files',
    y_col='label',
    target_size=(224, 224),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=32,
    shuffle=False,
)