# Capstone Project - Food Classifier

## Overview

The aim of this Capstone Project is to use deep learning to classify images of food. The notebook includes all implementation stages of the project: data processing, building and training the model, and finally an analysis of the results.

## Software Requirements

The project requires an installation of Anaconda with Python 2. The main libraries used will be Keras with a TensorFlow backend, scikit-learn, pandas and NumPy.

## Data Processing

The project uses the UECFOOD256 dataset from an academic institute in Japan. The dataset contains 31,651 images in 256 folders, with each folder holding one particular category of food.

In [51]:
from os import listdir
import os.path
import pandas as pd
from sklearn.datasets import load_files    
from sklearn.model_selection import train_test_split
import numpy as np
from shutil import copyfile
from sklearn.datasets import load_files       
from keras.utils import np_utils

# Input locations: the raw dataset folder and the category list file.
raw_dataset = 'UECFOOD256'
categories = 'labels.txt'

# Output root, with one sub-folder per data split.
images_folder = 'food-images'
train_folder, valid_folder, test_folder = (
    os.path.join(images_folder, split_name)
    for split_name in ('train', 'valid', 'test')
)

def create_folder(folder):
    """Create the directory `folder` unless that path already exists.

    Only the last path component is created (os.mkdir), so the parent
    directory must already be present.
    """
    if os.path.exists(folder):
        return
    os.mkdir(folder)
        
# Create the output tree. images_folder comes first because it is the parent
# of the three split folders and create_folder only makes one level.
for _folder in (images_folder, train_folder, valid_folder, test_folder):
    create_folder(_folder)

# Category table; the first row of the file is treated as the column header.
labels = pd.read_csv(categories, header=0, sep=",")

def get_label_from_raw(raw_label, labels):
    """Return the category name for a raw (1-based) category number.

    raw_label is converted with int() and mapped to the zero-based row
    position in `labels`; the value of that row's "name" column is returned.
    """
    row_position = int(raw_label) - 1
    row = labels.iloc[row_position]
    return row["name"]

### Cross validation

Currently the images are all grouped together within their labeled folders, and they need to be separated into training, validation and testing sets for each label. First the names of the files are loaded into a collection, then train_test_split is used to obtain the training and testing sets. The method is called again to further split the training data into training and validation sets.

Three folders are created (train, valid and test), each containing a subfolder named after the label that holds the images assigned to that set by the split described above. The methods are defined here:

In [52]:
# Function to confirm the number of files per label
def get_total_images_per_label(path):
    """Print the number of .jpg files in each sub-folder of `path`.

    Entries of `path` that are not directories are skipped. After the
    per-folder lines, the sum over all folders is printed.
    """
    grand_total = 0
    for entry in listdir(path):
        candidate = os.path.join(path, entry)
        if not os.path.isdir(candidate):
            continue
        jpg_count = sum(1 for name in os.listdir(candidate) if name.endswith('.jpg'))
        print('Folder ' + entry + ' has ' + str(jpg_count) + ' images')
        grand_total += jpg_count
    print('total files ' + str(grand_total))

def train_validate_test_split(path, raw_label, labels):
    """Split the .jpg file names of one category folder into three sets.

    Args:
        path: Root folder of the raw dataset.
        raw_label: Name of the category sub-folder inside `path`; also
            passed to get_label_from_raw to look up the category name.
        labels: Category table forwarded to get_label_from_raw.

    Returns:
        A tuple (X_train, X_val, X_test, y_train, y_val, y_test). Each X
        array has one row per image with the file name in column 0; each y
        array has the category name repeated once per row.
    """
    label = get_label_from_raw(raw_label, labels)
    image_folder = os.path.join(path, raw_label)
    # os.listdir returns entries in arbitrary order. Sorting first means the
    # fixed random_state values below select the same files on every run
    # and on every machine.
    files = sorted(name for name in os.listdir(image_folder) if name.endswith('.jpg'))

    # Column vectors: one row per file.
    X = np.vstack(files)
    y = np.vstack([label] * len(files))

    # Hold out 10% of all files for testing, then 20% of the remainder for
    # validation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

    return X_train, X_val, X_test, y_train, y_val, y_test

def copy_images_to_folder(src_folder, set_folder, food_label, images):
    """Copy the listed images from `src_folder` into `set_folder`/`food_label`.

    The label sub-folder is created if it does not exist yet. Each element
    of `images` is indexed with [0] to obtain the file name, so `images` is
    expected to be a sequence of one-element rows.
    """
    label_dir = os.path.join(set_folder, food_label)
    label_dir_missing = not os.path.exists(label_dir)
    if label_dir_missing:
        os.mkdir(label_dir)

    file_names = (row[0] for row in images)
    for file_name in file_names:
        copyfile(os.path.join(src_folder, file_name),
                 os.path.join(label_dir, file_name))

The cross validation split is performed here:

In [54]:
# Category processed by this cell.
raw_label = '1'

X_train, X_val, X_test, y_train, y_val, y_test = train_validate_test_split(raw_dataset, raw_label, labels)

# Sanity check: each X/y pair should report the same length.
print('\n'.join(
    str(len(part))
    for part in (X_train, y_train, X_val, y_val, X_test, y_test)
))

# Target sub-folders are named "<raw label>.<category name>".
food_label = raw_label + '.' + get_label_from_raw(raw_label, labels)
src_folder = os.path.join(raw_dataset, raw_label)
copy_images_to_folder(src_folder, train_folder, food_label, X_train)
copy_images_to_folder(src_folder, valid_folder, food_label, X_val)
copy_images_to_folder(src_folder, test_folder, food_label, X_test)

446
446
112
112
62
62


The data is now loaded:

In [50]:
# define function to load train, test, and validation datasets
def load_dataset(path):
    """Return the image file paths under `path` and their one-hot targets.

    Args:
        path: Folder whose sub-folders each hold the images of one category.

    Returns:
        (files, targets): an array of file paths and the matching one-hot
        encoded targets, with len(labels) columns.
    """
    # Only the paths and the folder-derived target indices are used here,
    # so skip reading the bytes of every image into memory.
    data = load_files(path, load_content=False)
    files = np.array(data['filenames'])
    targets = np_utils.to_categorical(np.array(data['target']), len(labels))
    return files, targets

# load train, test, and validation datasets
train_files, train_targets = load_dataset(train_folder)
valid_files, valid_targets = load_dataset(valid_folder)
test_files, test_targets = load_dataset(test_folder)

# print statistics about the dataset
# The category count comes from the labels table (one row per category);
# the previous reference to `dog_names` was undefined and raised NameError.
print('There are %d total food categories.' % len(labels))
print('There are %s total food images.\n' % len(np.hstack([train_files, valid_files, test_files])))
print('There are %d training food images.' % len(train_files))
print('There are %d validation food images.' % len(valid_files))
print('There are %d test food images.'% len(test_files))

### Image Resize

The first step is to resize all the images in order to standardize the input dimensions for the convolutional neural network. 