# 爬取滑坡图片
---

## 1. 定义函数

In [None]:
import time
import asyncio
import aiofiles
import nest_asyncio

nest_asyncio.apply()
import aiohttp
from tqdm import tqdm
from lxml import etree
from faker import Faker
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options

### 1.1 抓取页面

In [None]:
def grabHtml(url):
    """Open *url* in Firefox, submit the image search and return the scrolled page.

    The page is scrolled to the bottom repeatedly until the document height
    stops growing, so that content appended while scrolling ends up in the
    returned source.

    Args:
        url: Address of the page to open; it must contain an element with
            id ``kw`` that accepts the search keywords.

    Returns:
        The final page source encoded as UTF-8 bytes.
    """
    # Open the address in Firefox and search for the keywords.
    web = Firefox(options=option)
    try:
        web.get(url)
        search = web.find_element(By.XPATH, '//*[@id="kw"]')
        search.send_keys("滑坡无人机图像", Keys.ENTER)
        time.sleep(2)

        preheight = 0
        # Initialised before the loop: if the very first height reading does
        # not exceed ``preheight`` the retry branch below runs immediately and
        # would otherwise read an unassigned variable.
        epoch = 3

        while True:
            height = web.execute_script("return action=document.body.scrollHeight")
            if height > preheight:
                web.execute_script("window.scrollTo(0, document.body.scrollHeight)")
                time.sleep(2)
                preheight = height
                epoch = 3  # the page grew, so reset the retry budget
            elif epoch:
                # Height did not change: retry up to 3 times, waiting 5 seconds each.
                time.sleep(5)
                epoch -= 1
            else:
                # Retries exhausted: treat the page as fully loaded and stop.
                print("滚动条已经处于页面最下方！")
                break

        return web.page_source.encode("utf-8")
    finally:
        # Always shut the browser down, even when a step above raises.
        web.quit()

### 1.2 获取链接

In [None]:
def getHref(text):
    """Return the ``href`` values of the image items found in an HTML page.

    Args:
        text: HTML document to parse with ``etree.HTML``.

    Returns:
        A list with the ``href`` attribute of every ``a`` element located under
        ``li[@class="imgitem normal"]`` inside each ``div[@class="imgpage"]``
        of the element with id ``imgid``, in the order they are found.
    """
    root = etree.HTML(text)
    pages = root.xpath('//*[@id="imgid"]/div[@class="imgpage"]')
    return [
        link
        for page in pages
        for link in page.xpath('./ul/li[@class="imgitem normal"]/a/@href')
    ]

### 1.3 获取源地址

In [None]:
def getSrc(domain, hrefs):
    """Visit every link and collect the ``src`` of its ``currentImg`` element.

    Args:
        domain: Prefix that is concatenated in front of each entry of *hrefs*.
        hrefs: Iterable of link suffixes to open one after another.

    Returns:
        A list of image addresses, each terminated by a newline. Pages without
        an element with id ``currentImg``, or whose element has no usable
        ``src`` value, contribute nothing.
    """
    srcs = []
    web = Firefox(options=option)
    try:
        for href in tqdm(hrefs, desc="获取src进度"):
            web.get(domain + href)
            matches = web.find_elements(by=By.XPATH, value='//*[@id="currentImg"]')
            if not matches:
                continue
            src = matches[0].get_attribute("src")
            # get_attribute may yield None (or an empty string); skip those
            # instead of failing on the concatenation or storing a blank line.
            if src:
                srcs.append(src + "\n")
    finally:
        # Always shut the browser down, even when a page load raises.
        web.quit()
    return srcs

### 1.4 保存源地址

In [None]:
def saveFile(content, file, mod):
    """Write every item of *content* to *file*, opened with mode *mod*.

    Args:
        content: Iterable of strings (or of bytes when *mod* is a binary
            mode); items are written back to back, no separators are added.
        file: Path of the file to open.
        mod: Mode string handed to ``open``, for example ``"w"`` or ``"a"``.
    """
    with open(file, mod) as handle:
        handle.writelines(content)

### 1.5 下载图片

In [None]:
async def downloadImg(name, src, session):
    """Download one image and save it as ``../data/Image/Other/20220508_<name>.jpg``.

    The request is completed before the output file is opened, so a failed
    request does not leave an empty file behind, and an HTTP error response
    is not stored as if it were an image.

    Args:
        name: Value formatted with ``:02`` into the file name (``main`` passes
            the position of the address in the source list).
        src: Address of the image to request.
        session: Client session whose ``get`` is used for the request.

    Raises:
        aiohttp.ClientResponseError: If the response status is 400 or higher.
    """
    async with session.get(src, headers={"user-agent": fake.chrome()}) as resp:
        resp.raise_for_status()
        img = await resp.read()
    async with aiofiles.open(f"../data/Image/Other/20220508_{name:02}.jpg", "wb") as f:
        await f.write(img)


In [None]:
async def main():
    """Download every address in the module-level ``sources`` list concurrently.

    Each entry is stripped of surrounding whitespace and handed to
    ``downloadImg`` together with its position in the list. At most 40
    connections are open at the same time. Failed downloads do not abort the
    others; their number is printed once all tasks have finished.
    """
    if not sources:
        # Nothing to download; waiting on an empty task collection is an error.
        return

    connector = aiohttp.TCPConnector(limit=40)
    async with aiohttp.ClientSession(connector=connector) as session:
        tasks = [
            asyncio.create_task(downloadImg(name, src.strip(), session))
            for name, src in enumerate(sources)
        ]
        # return_exceptions=True collects each task's exception as a result,
        # so one failure neither interrupts the rest nor goes unretrieved.
        results = await asyncio.gather(*tasks, return_exceptions=True)

    failures = [result for result in results if isinstance(result, BaseException)]
    if failures:
        print(f"{len(failures)}张图片下载失败")


### 1.6 统计时间

In [None]:
class Timer:
    """Call a function once and print how long the call took.

    Args:
        func: Callable to run.
        desc: Label printed in front of the elapsed time.
        *args: Positional arguments passed to *func* by ``start``.
    """

    def __init__(self, func, desc="", *args):
        self.func = func
        self.args = args
        self.desc = desc

    def _check_args(self):
        """Raise AssertionError when ``self.args`` cannot be passed to ``self.func``."""
        import inspect

        try:
            signature = inspect.signature(self.func)
        except (TypeError, ValueError):
            # No introspectable signature (some builtins): skip the check and
            # let the call itself report a mismatch.
            return
        try:
            # bind() understands defaults, *args and bound methods, which a
            # plain comparison against the declared argument count does not.
            signature.bind(*self.args)
        except TypeError as err:
            # AssertionError is kept so existing callers see the same exception
            # type; raising it explicitly also survives ``python -O``.
            raise AssertionError("参数不匹配") from err

    def start(self):
        """Run the function with the stored arguments and print the elapsed time.

        Returns:
            The function's return value, or ``None`` when the function raised
            ``ValueError`` (which is printed instead of propagated).

        Raises:
            AssertionError: If the stored arguments do not fit the function's
                signature; the function is not called in that case.
        """
        begin = time.time()
        self._check_args()
        try:
            data = self.func(*self.args)
        except ValueError as v:
            print("捕获异常", v)
        else:
            return data
        finally:
            print(f"{self.desc}耗时{(time.time() - begin):.2f}秒")


## 2. 运行脚本

### 2.1 抓取

In [None]:
# Address that grabHtml opens; also passed to getSrc as the prefix for links.
baidu = "https://image.baidu.com"
# Faker instance; downloadImg calls fake.chrome() for its user-agent header.
fake = Faker()
# Firefox options shared by grabHtml and getSrc.
option = Options()
option.add_argument("--headless")  # run the browser headless
# Run grabHtml(baidu) under a Timer; start() returns grabHtml's result,
# or None when the call raised a ValueError.
htmltimer = Timer(grabHtml, "获取页面", baidu)
html = htmltimer.start()

### 2.2 解析

In [None]:
# Extract the image-item links from the grabbed page.
hrefs = getHref(html)
# Open each link (prefixed with the base address) and collect the image
# addresses, one newline-terminated entry per image.
sources = getSrc(baidu, hrefs)
# Append the collected addresses to ../src.txt.
saveFile(sources, "../src.txt", "a")

### 2.3 下载

In [None]:
if __name__ == "__main__":
    # The coroutine object returned by main() is the single positional
    # argument that Timer forwards to asyncio.run.
    downloadtimer = Timer(asyncio.run, "下载图片", main())
    downloadtimer.start()