In [1]:
# https://developers.google.com/custom-search/json-api/v1/reference/cse/list
# https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html

In [7]:
%matplotlib inline
import requests
import numpy as np
import pandas as pd
from PIL import Image
from keras.preprocessing.image import img_to_array
from io import BytesIO
from IPython.display import clear_output
import imagehash
from extras import *
from keys import GOOGLE_SEARCH_KEY

### Image retrieval

In [8]:
url = 'https://www.googleapis.com/customsearch/v1/'

params = {
    'q': 'interior design bathroom',
    'num': 10,
    'start': 1,
    'imgSize': 'medium',
    'searchType': 'image',
    'fileType': 'jpg',
    'cx': '011640269314203163488:yn7alr9klxc',
    'key': GOOGLE_SEARCH_KEY,
    'filter': 1,
    'imgType': 'photo',
    'imgColorType': 'color'
}

In [10]:
def get_image(image_url):
    r = requests.get(image_url)
    image_binary = r.content
    
    try:
        image_array = img_to_array(Image.open(BytesIO(image_binary)))
    except IOError:
        return np.array([])
    
    return image_array

In [11]:
def get_image_batch(index):
    error = None
    params['start'] = index
    r = requests.get(url, params)
    google_search_json = r.json()
    
    batch_images = np.array([])
    if ('items' in google_search_json):
        batch_images = np.array([get_image(item['link']) for item in google_search_json['items']])
        # Remove empty images
        batch_images = batch_images[np.int_(np.nonzero(batch_images)[0])]
    else: 
        error = google_search_json

    return batch_images, error

### Dataframe construction

In [None]:
df = pd.DataFrame({'original': [], 'category': []})

In [20]:
searches = [
#     { 'query': 'interior design', 'category': 'general'},
#     { 'query': 'interior design bed room', 'category': 'bedroom' },
#     { 'query': 'interior design living room', 'category': 'living_room' },
#     { 'query': 'interior design kitchen', 'category': 'kitchen' },
#     { 'query': 'interior design dining room', 'category': 'dining_room' },
#     { 'query': 'interior design bathroom', 'category': 'bathroom' },
#     { 'query': 'bed room', 'category': 'bedroom' },
#     { 'query': 'living room', 'category': 'living_room' },
    { 'query': 'kitchen', 'category': 'kitchen' },
    { 'query': 'dining room', 'category': 'dining_room' },
    { 'query': 'bathroom', 'category': 'bathroom' },
    { 'query': 'bed room ideas', 'category': 'bedroom' },
    { 'query': 'living room ideas', 'category': 'living_room' },
    { 'query': 'kitchen ideas', 'category': 'kitchen' },
    { 'query': 'dining room ideas', 'category': 'dining_room' },
    { 'query': 'bathroom ideas', 'category': 'bathroom' },
    { 'query': 'modern bed room', 'category': 'bedroom' },
    { 'query': 'modern living room', 'category': 'living_room' },
    { 'query': 'modern kitchen', 'category': 'kitchen' },
    { 'query': 'modern dining room', 'category': 'dining_room' },
    { 'query': 'modern bathroom', 'category': 'bathroom' },
]

for search  in searches:
    clear_output()
    print 'fetching images for ' + search['query']
    params['q'] = search['query']
    
    for i in range(1, 1000, 10):
        print 'batch %d' % i
        batch, error = get_image_batch(i)
        if error:
            print error
            break

        df = pd.concat([df, pd.DataFrame({'original': batch, 'category': search['category'] })])

fetching images for modern bathroom
batch 1
batch 11
batch 21
batch 31
batch 41
batch 51
batch 61
batch 71
batch 81
batch 91
batch 101
{u'error': {u'code': 400, u'message': u'Invalid Value', u'errors': [{u'domain': u'global', u'message': u'Invalid Value', u'reason': u'invalid'}]}}


In [21]:
df.shape

(1434, 2)

### Remove images without the proper shape (probably pngs)

In [22]:
df = df[df.original.apply(lambda x: x.shape[2] == 3)]

In [23]:
df.shape

(1432, 2)

### Remove duplicated images

In [24]:
df['image_hashes'] = df.original.map(lambda x: imagehash.whash(array_to_img(x)).__str__())
df = df.drop_duplicates(subset='image_hashes')

In [25]:
df.shape

(1232, 3)

### Save data

In [26]:
# df.to_pickle('raw_data.pkl')

### Visualize data

In [27]:
df.shape

(1232, 3)

In [28]:
df.columns

Index([u'category', u'original', u'image_hashes'], dtype='object')

In [29]:
df.category.value_counts()

bathroom       294
bedroom        251
kitchen        228
living_room    208
dining_room    165
general         86
Name: category, dtype: int64

In [31]:
n_samples = df.shape[0]
show_photos(df.original.iloc[np.random.choice(n_samples, 9)])

Widget Javascript not detected.  It may not be installed or enabled properly.
