In [39]:
import os
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
import time
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED # multithreading

In [40]:
# Directory that receives the downloaded poster images. The path is relative,
# so it resolves against the kernel's current working directory.
download_path = "./Projects/DoubanPic"

# exist_ok=True creates the directory (and any missing parents) when absent and
# is a no-op when it already exists. This replaces the check-then-create
# pattern, which can raise FileExistsError if another process or thread creates
# the directory between the check and the makedirs call.
os.makedirs(download_path, exist_ok=True)
    
def download_pic(url):
    """Download every image found in the main article area of one listing page.

    The page at ``url`` is fetched with a Chrome User-Agent header, parsed with
    BeautifulSoup, and each ``<img>`` inside ``<div class="article">`` is saved
    to ``download_path`` as ``<alt text>.jpg``.

    Parameters
    ----------
    url : str
        Address of the listing page to scrape.

    Returns
    -------
    None
        Images are written to disk and a status line is printed.

    Raises
    ------
    requests.RequestException
        If the page request fails, times out, or returns an HTTP error status.
    """
    ua = UserAgent()
    headers = {'User-Agent': ua.chrome}

    # A timeout keeps a stalled connection from blocking this worker forever.
    r = requests.get(url, headers=headers, timeout=30)
    # Stop on 4xx/5xx instead of trying to parse an error page as a listing.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')

    content = soup.find('div', class_='article')  # main div
    if content is None:
        # Without the container there is nothing to scrape; say so explicitly
        # rather than failing with an AttributeError on None.
        print(f"{url} has no 'article' container, nothing downloaded")
        return

    # Characters that are path separators or otherwise reserved in file names
    # on common platforms; they are replaced so the alt text is a safe name.
    reserved = '\\/:*?"<>|'

    for img in content.find_all("img"):  # all pictures in the main div
        link = img.get("src")  # image source link
        name = img.get("alt")  # alt text, used as the file name
        if not link or not name:
            # Skip images that lack a source or a usable name.
            continue

        safe_name = "".join("_" if ch in reserved else ch for ch in name).strip()
        if not safe_name:
            continue

        urlretrieve(link, os.path.join(download_path, f"{safe_name}.jpg"))  # download

    print(f"{url} all pictures downlaoded")
        

In [41]:
def main():
    """Scrape all ten Top-250 listing pages concurrently and report the runtime.

    Builds the ten page URLs (offsets 0, 25, ..., 225), submits one
    ``download_pic`` task per URL to a thread pool, waits for all of them,
    reports any task that raised, and prints the elapsed wall-clock time.

    The sequential equivalent is simply calling ``download_pic(url)`` for each
    URL in a plain ``for`` loop.
    """
    start_urls = ["https://movie.douban.com/top250"]
    start_urls.extend(
        f"https://movie.douban.com/top250?start={i * 25}&filter="
        for i in range(1, 10)
    )

    start_time = time.time()

    # Multithreaded processing: one task per listing page.
    with ThreadPoolExecutor(max_workers = 10) as executor:
        futures = {executor.submit(download_pic, url): url for url in start_urls}
        # Block here, while the pool is still open, until every task finishes.
        wait(list(futures), return_when = ALL_COMPLETED)

    end_time = time.time()

    # An exception raised inside a worker is stored on its Future and is
    # otherwise never shown; surface each failure so a broken page is visible.
    for future, url in futures.items():
        error = future.exception()
        if error is not None:
            print(f"{url} failed: {error!r}")

    print(f"running time: {end_time - start_time}")

In [42]:
# Entry point: run the scraper when this code is executed directly (the guard
# keeps main() from running if the code is imported as a module instead).
if __name__ == "__main__":
    main()

https://movie.douban.com/top250?start=125&filter= all pictures downlaoded
https://movie.douban.com/top250?start=150&filter= all pictures downlaoded
https://movie.douban.com/top250 all pictures downlaodedhttps://movie.douban.com/top250?start=225&filter= all pictures downlaoded

https://movie.douban.com/top250?start=100&filter= all pictures downlaoded
https://movie.douban.com/top250?start=50&filter= all pictures downlaoded
https://movie.douban.com/top250?start=25&filter= all pictures downlaoded
https://movie.douban.com/top250?start=200&filter= all pictures downlaoded
https://movie.douban.com/top250?start=75&filter= all pictures downlaoded
https://movie.douban.com/top250?start=175&filter= all pictures downlaoded
running time: 7.350672960281372
