In [1]:
import os
import sys
import traceback
from mimetypes import guess_extension
from time import time, sleep
from urllib.request import urlopen, Request
from urllib.parse import quote
from bs4 import BeautifulSoup

In [2]:
# Contact address; passed to Fetcher below, which sends it as the User-Agent header of each request.
MY_EMAIL_ADDR = 'takanakahiko@gmail.com'

In [3]:
class Fetcher:
    """Minimal best-effort URL downloader that sends a fixed User-Agent header.

    Args:
        ua: Value sent as the ``User-Agent`` request header.
        timeout: Seconds passed to ``urlopen`` as its timeout (default 3).
    """

    def __init__(self, ua='', timeout=3):
        self.ua = ua
        self.timeout = timeout

    def fetch(self, url):
        """Download ``url`` and return ``(body_bytes, content_type)``.

        The call is best-effort: on any ordinary failure (malformed URL,
        network error, timeout, HTTP error status) a message and the traceback
        are written to stderr and ``(None, None)`` is returned instead of
        raising.

        Args:
            url: Absolute URL to download.

        Returns:
            A ``(bytes, str or None)`` tuple holding the response body and the
            value of its ``Content-Type`` header, or ``(None, None)`` on failure.
        """
        try:
            # Request() raises ValueError for a URL without a scheme, so it has
            # to live inside the try to honour the (None, None) contract.
            req = Request(url, headers={'User-Agent': self.ua})
            with urlopen(req, timeout=self.timeout) as p:
                b_content = p.read()
                # ``headers`` is available on every urllib response object,
                # whereas ``getheader`` exists only on HTTP responses.
                mime = p.headers.get('Content-Type')
        except Exception:
            # Deliberately not a bare ``except``: KeyboardInterrupt and
            # SystemExit must still be able to stop a long download loop.
            sys.stderr.write('Error in fetching {}\n'.format(url))
            sys.stderr.write(traceback.format_exc())
            return None, None
        return b_content, mime

# Shared module-level instance used by fetch_and_save_img and img_url_list.
fetcher = Fetcher(MY_EMAIL_ADDR)

In [4]:
def fetch_and_save_img(word, data_dir='../data/'):
    """Download the image search results for ``word`` into ``data_dir``.

    Each URL returned by ``img_url_list(word)`` is fetched with the shared
    ``fetcher``. A response is skipped when the fetch fails, when the body or
    Content-Type is empty, or when no file extension can be derived from the
    Content-Type. Saved files are named ``<index><ext>``, where ``index`` is the
    position of the URL in the result list; existing files with the same name
    are overwritten.

    Args:
        word: Search keyword.
        data_dir: Output directory; created if it does not exist.

    Returns:
        None. Prints one ``fetched <url>`` line per saved file.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(data_dir, exist_ok=True)

    for i, img_url in enumerate(img_url_list(word)):
        sleep(0.1)  # brief pause before each request
        img, mime = fetcher.fetch(img_url)
        if not mime or not img:
            continue
        # Drop parameters such as "; charset=..." and surrounding whitespace,
        # otherwise guess_extension fails to recognise the media type.
        ext = guess_extension(mime.split(';')[0].strip())
        if ext in ('.jpe', '.jpeg'):
            ext = '.jpg'
        if not ext:
            continue
        result_file = os.path.join(data_dir, str(i) + ext)
        with open(result_file, mode='wb') as f:
            f.write(img)
        print('fetched', img_url)

In [5]:
def img_url_list(word):
    """Return the image URLs found by a Yahoo! Japan image search for ``word``.

    Yahoo is used because, per the original author's note, this scraping
    approach does not work against Google.

    Args:
        word: Search keyword; it is percent-encoded before being placed in the
            query string.

    Returns:
        A list of unique ``http``/``https`` URLs taken from the ``href`` of
        every ``<a target="imagewin">`` element, in page order. An empty list
        is returned when the search page could not be fetched.
    """
    url = 'http://image.search.yahoo.co.jp/search?n=60&p={}&search.x=1'.format(quote(word))
    byte_content, _ = fetcher.fetch(url)
    if not byte_content:
        # fetch() reports failure as (None, None); nothing to parse.
        return []
    # errors='replace' keeps a single bad byte from aborting the whole search.
    structured_page = BeautifulSoup(byte_content.decode('UTF-8', errors='replace'), 'html.parser')
    img_link_elems = structured_page.find_all('a', attrs={'target': 'imagewin'})
    hrefs = (e.get('href') for e in img_link_elems)
    # Anchors without an href yield None and are skipped. dict.fromkeys
    # de-duplicates while keeping first-seen order, so the indices used for
    # file names are stable between runs.
    return list(dict.fromkeys(h for h in hrefs if h and h.startswith('http')))

In [6]:
# Download the image search results for the word '猫' ("cat") into ../data/.
fetch_and_save_img('猫')

fetched http://3.bp.blogspot.com/-_r1FI94rbMs/Un2n3UqE4JI/AAAAAAAA53w/lcWc6uQruFs/s1600/IMG_5160.JPG
fetched http://farm4.static.flickr.com/3177/3015234038_31092c7e53.jpg
fetched http://blogimg.goo.ne.jp/user_image/73/9e/4c1fb20773c8e6206b6c6b084e5be575.jpg
fetched http://livedoor.blogimg.jp/tokyocat_g/imgs/f/b/fb7b897e.jpg
fetched https://imgcp.aacdn.jp/img-a/800/auto/aa/gm/article/3/1/8/0/6/201801222224/neko2.jpg
fetched http://farm1.static.flickr.com/75/182107227_f85c3c1445.jpg
fetched http://farm3.static.flickr.com/2067/2149377025_8999d9c91a.jpg
fetched http://livedoor.blogimg.jp/loveai0221/imgs/c/c/ccfbbd61.jpg
fetched http://www.nekohouse.jp/photo/0901/01/18-03b.jpg
fetched https://upload.wikimedia.org/wikipedia/commons/thumb/9/9e/2016-06-14_Orange_and_white_tabby_cat_born_in_2016_%E8%8C%B6%E3%83%88%E3%83%A9%E7%99%BD%E3%81%AD%E3%81%93_DSCF6526%E2%98%86%E5%BD%A1.jpg/200px-2016-06-14_Orange_and_white_tabby_cat_born_in_2016_%E8%8C%B6%E3%83%88%E3%83%A9%E7%99%BD%E3%81%AD%E3%81%93_DSCF