### Data setup

Creates the proper directory structure to use with Keras method `.flow_from_directory` (the directories for the training images and the validation images should each contain one subdirectory per class with PNG or JPG images).

In [2]:
import glob, os, shutil, requests
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split

from subprocess import call
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

---
## Additional pizza images
Get additional pizza images from [ImageNet](http://image-net.org/synset?wnid=n07873807) to add to pizza train/validation directories.

In [3]:
# ImageNet endpoint queried for the image URLs of synset n07873807.
url = "http://image-net.org/api/text/imagenet.synset.geturls?wnid=n07873807"
# Without a timeout requests can wait on an unresponsive server indefinitely.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of treating an error page as a URL list.
response.raise_for_status()

# Pull the response body out of requests (split on whitespace into URLs below)
html = response.text

In [4]:
# Download each whitespace-separated URL in the response with wget,
# saving into ../../moar_pizza/ (wget's -P sets the target directory).
for image_url in response.text.split():
    # print() with a single argument works on both Python 2 and Python 3,
    # unlike the Python 2-only `print image_url` statement.
    print(image_url)
    # Arguments are passed as a list, so the URL is never interpreted by a shell.
    call(['wget', image_url, '-P', '../../moar_pizza/'])

http://farm1.static.flickr.com/73/227675530_b97ea8210f.jpg
http://farm4.static.flickr.com/3462/3994490118_4ae4cf3f8e.jpg
http://farm3.static.flickr.com/2422/4059077353_b7a3ec05f3.jpg
http://farm4.static.flickr.com/3360/3197291062_30a65d13a1.jpg
http://farm1.static.flickr.com/46/146639031_31723b0ff7.jpg
http://farm4.static.flickr.com/3372/3182498543_8a306e6aeb.jpg
http://farm1.static.flickr.com/51/152996006_66ff40702c.jpg
http://farm4.static.flickr.com/3154/2282810258_b2dc39a560.jpg
http://farm4.static.flickr.com/3207/2714395801_d31ce40dab.jpg
http://farm4.static.flickr.com/3254/2460137196_cfbe933ee2.jpg
http://farm3.static.flickr.com/2415/2880231196_f41bd9732e.jpg
http://farm1.static.flickr.com/107/298492272_144bdd7c7d.jpg
http://farm3.static.flickr.com/2776/4129592861_63edd5c277.jpg
http://farm1.static.flickr.com/30/43649245_b0aaa983e4.jpg
http://farm4.static.flickr.com/3178/2588264572_c3f1130448.jpg
http://farm4.static.flickr.com/3288/2982064467_330d5a1c0d.jpg
http://farm1.static.fli

KeyboardInterrupt: 

---
## Not pizza images
2000 images labeled not pizza: 1600 for training and 400 for validation.

In [2]:
# All .jpg files in the source folder of non-pizza food images.
not_pizza_files = [x for x in glob.glob('/Users/VanessaG/Desktop/food_images/*') if x.endswith('.jpg')]
# Draw 2000 distinct images. A seeded RandomState makes the sample identical
# on every run (raises ValueError if fewer than 2000 files are available).
not_pizza = np.random.RandomState(42).choice(not_pizza_files, 2000, replace=False)
# 80/20 split -> 1600 train / 400 validation; random_state fixes the shuffle.
not_pizza_train, not_pizza_val = train_test_split(not_pizza, train_size=(1600/2000.), random_state=42)

In [3]:
# Sizes of the (train, validation) splits for the not-pizza class.
tuple(len(split) for split in (not_pizza_train, not_pizza_val))

(1600, 400)

In [4]:
#train set
not_pizza_train_dest = '/Users/VanessaG/Desktop/pizza_class_data/train/not_pizza/'
for i, image in enumerate(not_pizza_train):
    shutil.copy2(image, not_pizza_train_dest)

In [5]:
# Count the .jpg files now in train/not_pizza (same .jpg filter as the other count cells).
len([x for x in glob.glob('/Users/VanessaG/Desktop/pizza_class_data/train/not_pizza/*') if x.endswith('.jpg')])

1600

In [6]:
#validation set
not_pizza_val_dest = '/Users/VanessaG/Desktop/pizza_class_data/validation/not_pizza/'
for i, image in enumerate(not_pizza_val):
    shutil.copy2(image, not_pizza_val_dest)

In [7]:
# Number of .jpg files currently in validation/not_pizza.
sum(1 for path in glob.glob('/Users/VanessaG/Desktop/pizza_class_data/validation/not_pizza/*') if path.endswith('.jpg'))

400

---
## Pizza images
2000 images labeled pizza: 1600 for training and 400 for validation.

In [8]:
# Candidate .jpg files from the extra-images folder.
moar_pizza_candidates = [x for x in glob.glob('/Users/VanessaG/Desktop/moar_pizza/*') if x.endswith('.jpg')]
#remove files less than 5kb
# Build moar_pizza from the files that are kept, so the list never contains
# a path that was just deleted from disk.
moar_pizza = []
for f in moar_pizza_candidates:
    if (os.path.getsize(f)/1024. < 5):
        os.remove(f)
    else:
        moar_pizza.append(f)

In [9]:
# Food-101 pizza images: keep only the paths that end in .jpg.
pizza = list(filter(
    lambda path: path.endswith('.jpg'),
    glob.glob('/Users/VanessaG/Desktop/DSI-SF-2-vnessified/capstone/data_sources/food-101/images/pizza/*'),
))

In [10]:
# Pool the Food-101 pizza images with the additional downloaded ones.
all_pizza = pizza + moar_pizza
# Draw 2000 distinct images. A seeded RandomState makes the sample identical
# on every run (raises ValueError if fewer than 2000 files are available).
pizza = np.random.RandomState(42).choice(all_pizza, 2000, replace=False)
# 80/20 split -> 1600 train / 400 validation; random_state fixes the shuffle.
pizza_train, pizza_val = train_test_split(pizza, train_size=(1600/2000.), random_state=42)

In [11]:
# Sizes of the (train, validation) splits for the pizza class.
tuple(len(split) for split in (pizza_train, pizza_val))

(1600, 400)

In [12]:
#train set
pizza_train_dest = '/Users/VanessaG/Desktop/pizza_class_data/train/pizza/'
for i, image in enumerate(pizza_train):
    shutil.copy2(image, pizza_train_dest)

In [13]:
# Number of .jpg files currently in train/pizza.
sum(1 for path in glob.glob('/Users/VanessaG/Desktop/pizza_class_data/train/pizza/*') if path.endswith('.jpg'))

1600

In [14]:
#for validation folder
pizza_val_dest = '/Users/VanessaG/Desktop/pizza_class_data/validation/pizza/'
for i, image in enumerate(pizza_val):
    shutil.copy2(image, pizza_val_dest)

In [15]:
# Number of .jpg files currently in validation/pizza.
sum(1 for path in glob.glob('/Users/VanessaG/Desktop/pizza_class_data/validation/pizza/*') if path.endswith('.jpg'))

400