# 通过关键词和图片搜索引擎下载图片

在搜索引擎中搜索给定关键词，然后对返回结果的静态网页版本的源代码进行正则匹配，找出所有图片 URL 并逐一下载。

**实现思路**

- 先在一个文本文件中以 UTF-8 格式保存要下载的关键词列表
- 然后编写脚本自动为每一个关键词分配多进程，并行进行网页源码获取和图片下载。

下面我以美食为例来说明：

In [8]:
import os
import re
import urllib
from multiprocessing import Process

In [2]:
root = 'D:/datasets/food/'

In [5]:
# Create the data root directory together with any missing parent directories;
# exist_ok makes re-running the cell a no-op instead of an error.
os.makedirs(root, exist_ok=True)

In [6]:
keywords_list = ['烤鸭', '鹿肉', '地锅鸡', '包子', '水饺', '麻花', '鸡汤']

将 `keywords_list` 写入磁盘：

In [7]:
# Write one keyword per line. The encoding is given explicitly so the file is
# UTF-8 on every platform instead of depending on the locale's default encoding.
with open(f'{root}keywords.txt', 'w', encoding='utf-8') as fp:
    fp.writelines(f'{k}\n' for k in keywords_list)

我只需要下载以下格式的图片：

In [9]:
IMAGE_FORMAT = ['jpg', 'png', 'jpeg']

## 定义搜索引擎获取关键词和指定样本的模板的 API

```py
import os
import re
import subprocess
import urllib
import urllib.parse
import urllib.request
from multiprocessing import Process


# {keyword} is the search keyword field
# {index} is the field holding the start offset of the images to list
# SUPPORTED_FORMATS lists the (lower-case) file extensions that get downloaded.
SUPPORTED_FORMATS = ['jpg', 'png', 'jpeg']
URL_TEMPLATE = r'http://image.b***u.com/search/flip?tn=b***uimage&ie=utf-8&word={keyword}&pn={index}'

# Seconds to wait for a search-result page before giving up on it.
_PAGE_TIMEOUT_SECONDS = 10

# Downloads whose size is not above this many bytes are treated as failures.
_MIN_IMAGE_BYTES = 1024


def _fetch_with_wget(url, filename):
    """Download *url* to *filename* with wget; return True if a usable file resulted."""
    # An argument list (no shell) keeps scraped URLs from being interpreted as
    # shell syntax, and "--" keeps a URL starting with "-" from being read as
    # a wget option.
    cmd = ['wget', '-t', '3', '-T', '5', '-O', filename, '--', url]
    try:
        subprocess.run(cmd)
    except OSError as e:
        # wget itself could not be started (e.g. it is not installed).
        print(e)

    if os.path.exists(filename) and os.path.getsize(filename) > _MIN_IMAGE_BYTES:
        return True
    # wget -O creates the output file even when the transfer fails.
    if os.path.exists(filename):
        os.remove(filename)
    return False


# Download function run by each worker process.
def download_images_from_baidu(dir_name, keyword, start_index, end_index):
    """Download the search results numbered ``start_index`` up to ``end_index``.

    Result pages are requested through ``URL_TEMPLATE`` starting at the current
    index, every ``objURL`` found in a page is a candidate, and candidates whose
    extension is in ``SUPPORTED_FORMATS`` are fetched with ``wget`` into
    ``<dir_name>/<index>.<ext>``. Each successful download is appended to
    ``<dir_name>_urls.txt`` as an ``index,url`` line.

    The index advances for every candidate, including skipped and failed ones,
    so the saved file numbers may have gaps.

    dir_name: directory the images are saved in (it must already exist)
    keyword: search keyword, as ``str`` or UTF-8 ``bytes``
    start_index: index of the first result to download
    end_index: index at which to stop (exclusive)
    """
    index = start_index
    # Percent-encode the keyword so non-ASCII keywords form a valid request URL.
    quoted_keyword = urllib.parse.quote(keyword)

    while index < end_index:
        page_url = URL_TEMPLATE.format(keyword=quoted_keyword, index=index)
        try:
            with urllib.request.urlopen(page_url, timeout=_PAGE_TIMEOUT_SECONDS) as response:
                html_text = response.read().decode('utf-8', 'ignore')
        except OSError as e:
            print(e)
            print('Cannot open {}. \nStopping ...'.format(page_url))
            break

        image_urls = re.findall(r'"objURL":"(.*?)"', html_text)
        if not image_urls:
            print('Cannot retrieve any more image urls from {}. \nStopping ...'.format(page_url))
            break

        downloaded_urls = []
        for image_url in image_urls:
            # Checked before every candidate so that skipped URLs cannot push
            # the index past the end of this worker's range.
            if index >= end_index:
                break

            basename = image_url.split('/')[-1]
            ext = basename[basename.rfind('.') + 1:]
            if ext.lower() in SUPPORTED_FORMATS:
                filename = '{}/{:0>6d}.{}'.format(dir_name, index, ext)
                if _fetch_with_wget(image_url, filename):
                    downloaded_urls.append('{:0>6d},{}'.format(index, image_url))
            index += 1

        if downloaded_urls:
            with open('{}_urls.txt'.format(dir_name), 'a', encoding='utf-8') as furls:
                furls.write('{}\n'.format('\n'.join(downloaded_urls)))

# Function that launches the download jobs.
def download_images(keywords, num_per_kw, procs_per_kw):
    """Download images for every keyword using several processes per keyword.

    Keyword number ``i`` is saved into a directory named after its zero-padded
    position (``000``, ``001``, ...), created in the current working directory.
    The index range ``[0, num_per_kw)`` of each keyword is cut into up to
    ``procs_per_kw`` contiguous chunks, one worker process per chunk. All
    processes are started together and the call returns once all have finished.

    keywords: iterable of search keywords
    num_per_kw: number of result indices to cover for each keyword
    procs_per_kw: maximum number of worker processes per keyword
    """
    # Ceiling division, so the chunks cover the whole range on an uneven split.
    num_per_proc = -(-num_per_kw // procs_per_kw)

    args_list = []
    for class_id, keyword in enumerate(keywords):
        dir_name = '{:0>3d}'.format(class_id)
        os.makedirs(dir_name, exist_ok=True)
        for i in range(procs_per_kw):
            start_index = i * num_per_proc
            # The worker treats end_index as exclusive, so no "- 1" here; the
            # clamp keeps the last chunk from running past num_per_kw.
            end_index = min(start_index + num_per_proc, num_per_kw)
            if start_index >= end_index:
                break
            args_list.append((dir_name, keyword, start_index, end_index))

    processes = [Process(target=download_images_from_baidu, args=x) for x in args_list]

    print('Starting to download images with {} processes ...'.format(len(processes)))

    for p in processes:
        p.start()

    for p in processes:
        p.join()

    print('Done!')

if __name__ == "__main__":
    # Read the keywords as text. In binary mode they would be bytes objects,
    # which format into the search URL as "b'...'" instead of the keyword.
    # utf-8-sig decodes UTF-8 with or without a leading byte-order mark.
    with open('keywords.txt', 'r', encoding='utf-8-sig') as f:
        foods = f.read().split()
    # Target 20,000 images per category, using 3 download processes per category.
    download_images(foods, 20000, 3)
```

将上面的代码封装为 `collect_data.py` 并放置在 `keywords.txt` 同一目录下。

In [10]:
import urllib