In [1]:
# https://developers.google.com/custom-search/json-api/v1/reference/cse/list
# https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html

In [2]:
%matplotlib inline
import requests
import numpy as np
import pandas as pd
from PIL import Image
from keras.preprocessing.image import img_to_array
from io import BytesIO
from IPython.display import clear_output
import imagehash
from extras import *

Using TensorFlow backend.


### Image retrieval

In [3]:
url = 'https://www.googleapis.com/customsearch/v1/'

params = {
    'q': 'interior design bathroom',
    'num': 10,
    'start': 1,
    'imgSize': 'medium',
    'searchType': 'image',
    'fileType': 'jpg',
    'cx': '011640269314203163488:yn7alr9klxc',
    'key': 'AIzaSyChVx6ZioqCYtN7H2QVVtyYqLdCthezUwc',
    'filter': 1,
    'imgType': 'photo',
    'imgColorType': 'color'
}

In [4]:
def get_image(image_url):
    r = requests.get(image_url)
    image_binary = r.content
    
    try:
        image_array = img_to_array(Image.open(BytesIO(image_binary)))
    except IOError:
        return np.array([])
    
    return image_array

In [5]:
def get_image_batch(index):
    print 'batch %d' % index
    params['start'] = index
    r = requests.get(url, params)
    google_search_json = r.json()
    
    batch_images = np.array([])
    if ('items' in google_search_json):
        batch_images = np.array([get_image(item['link']) for item in google_search_json['items']])
        # Remove empty images
        batch_images = batch_images[np.int_(np.nonzero(batch_images)[0])]
    else: 
        print google_search_json

    return batch_images

### Dataframe construction

In [8]:
searches = [
    { 'query': 'interior design', 'category': 'general'}
    { 'query': 'interior design bed room', 'category': 'bedroom' },
    { 'query': 'interior design living room', 'category': 'living_room' },
    { 'query': 'interior design kitchen', 'category': 'kitchen' },
    { 'query': 'interior design dining room', 'category': 'dining_room' },
    { 'query': 'interior design bathroom', 'category': 'bathroom' },
    { 'query': 'bed room', 'category': 'bedroom' },
    { 'query': 'living room', 'category': 'living_room' },
    { 'query': 'kitchen', 'category': 'kitchen' },
    { 'query': 'dining room', 'category': 'dining_room' },
    { 'query': 'bathroom', 'category': 'bathroom' },
    { 'query': 'bed room ideas', 'category': 'bedroom' },
    { 'query': 'living room ideas', 'category': 'living_room' },
    { 'query': 'kitchen ideas', 'category': 'kitchen' },
    { 'query': 'dining room ideas', 'category': 'dining_room' },
    { 'query': 'bathroom ideas', 'category': 'bathroom' },
]

df = pd.DataFrame({'original': [], 'category': []})
for search  in searches:
    clear_output()
    print 'fetching images for ' + search['category']
    params['q'] = search['query']
#     images = np.array([])
    
    for i in range(1, 1000, 10):
        batch = get_image_batch(i)

#         if batch.size == 0:
#             break
#         images = np.append(images, batch)
    
        df = pd.concat([df, pd.DataFrame({'original': batch, 'category': search['category'] })])
        df.to_pickle('raw_data_clean.pkl')

fetching images for bedroom
batch 1
{u'error': {u'code': 500, u'message': u'Backend Error', u'errors': [{u'domain': u'global', u'message': u'Backend Error', u'reason': u'backendError'}]}}
batch 11
batch 21
batch 31
batch 41
batch 51
batch 61
batch 71
batch 81
batch 91
batch 101
{u'error': {u'code': 403, u'message': u'This API requires billing to be enabled on the project. Visit https://console.developers.google.com/billing?project=470491645907 to enable billing.', u'errors': [{u'domain': u'usageLimits', u'message': u'This API requires billing to be enabled on the project. Visit https://console.developers.google.com/billing?project=470491645907 to enable billing.', u'reason': u'dailyLimitExceeded', u'extendedHelp': u'https://console.developers.google.com/billing?project=470491645907'}]}}
batch 111
{u'error': {u'code': 403, u'message': u'This API requires billing to be enabled on the project. Visit https://console.developers.google.com/billing?project=470491645907 to enable billing.', u'

KeyboardInterrupt: 

In [None]:
df.shape

### Remove images without the proper shape (probably pngs)

In [None]:
df = df[df.original.apply(lambda x: x.shape[2] == 3)]

In [None]:
df.shape

### Remove duplicated images

In [None]:
df['image_hashes'] = df.original.map(lambda x: imagehash.whash(array_to_img(x)).__str__())
df = df.drop_duplicates(subset='image_hashes')

In [None]:
df.shape

### Save data

In [None]:
# df.to_pickle('raw_data.pkl')

### Visualize data

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.category.value_counts()

In [None]:
n_samples = df.shape[0]
show_photos(df.original.iloc[np.random.choice(n_samples, 9)])