### 导入所需模块

In [None]:
import requests
import csv
import os
import re
from lxml import etree

### 前置准备
#### 目的url地址 'https://movie.douban.com/top250?start=0&filter='
#### 每页25条数据，翻页时 start 依次增加25（第二页 start=25，第三页 start=50，以此类推）

In [None]:
# Base URL of the Douban Top 250 list; paging parameters are passed per request.
totalUrl = 'https://movie.douban.com/top250?'
# Request headers carrying a desktop-browser user-agent string.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/94.0.4606.71 Safari/537.36 '
        'Core/1.94.197.400 QQBrowser/11.7.5287.400'
    ),
}

### 创建记录数据文件，仅运行一次
#### 重新爬取时需先删除 file 文件夹

In [None]:
# Prepare the output directory and data files. Every step is idempotent, so
# re-running this cell neither fails nor overwrites existing progress.
os.makedirs('./file', exist_ok=True)
# Create the CSV file with its header row. It is written as UTF-8 so that it
# matches the encoding used when data rows are appended later.
if not os.path.exists('./file/豆瓣电影top250.csv'):
    with open('./file/豆瓣电影top250.csv', 'w', encoding='utf-8', newline='') as writer_f:
        writer = csv.writer(writer_f)
        writer.writerow(['title', 'directors', 'ratingValue', 'casts', 'cover', 'detailLink', 'year', 'types', 'countrys', 'lang', 'datePublished', 'movieTime', 'starsLen', 'stars', 'summary', 'imgList', 'movieUrl'])
# Create the progress file that records the crawled list-page index; start at 0.
if not os.path.exists('./file/page.txt'):
    with open('./file/page.txt', 'w', encoding='utf-8') as f:
        f.write('0\n')

### 保存读取当前页

In [None]:
def get_page():
    """Return the last line of ./file/page.txt exactly as stored.

    The returned string keeps its trailing newline when the file has one.
    """
    with open('./file/page.txt', 'r') as progress_file:
        recorded_lines = progress_file.readlines()
    return recorded_lines[-1]
def set_page(newPage):
    """Append ``newPage`` as a new line at the end of ./file/page.txt."""
    with open('./file/page.txt', 'a') as progress_file:
        # print() writes str(newPage) followed by a single newline.
        print(newPage, file=progress_file)
def save_to_csv(resultList):
    """Append every row in ``resultList`` to ./file/豆瓣电影top250.csv as UTF-8.

    Each element of ``resultList`` is written as one CSV record.
    """
    with open('./file/豆瓣电影top250.csv', mode='a', encoding='utf-8', newline='') as csv_file:
        csv.writer(csv_file).writerows(resultList)

### 爬取函数
#### 爬取总数据页面,找到当前页所有的详细电影的url地址
#### 再在每个电影详细url地址中爬取具体所需的信息

In [None]:
# Seconds to wait on a single HTTP request before giving up.
_REQUEST_TIMEOUT = 10
# The Top 250 list is served as 10 list pages of 25 movies each.
_PAGE_SIZE = 25
_TOTAL_PAGES = 10


def _first(values, default=''):
    """Return the first element of ``values``, or ``default`` when it is empty."""
    return values[0] if values else default


def _split_slash_field(pattern, html):
    """Return the '/'-separated text captured by ``pattern`` as a comma-joined string.

    Gives an empty string when ``pattern`` does not match ``html``.
    """
    raw = _first(re.findall(pattern, html))
    return ','.join(part.strip() for part in raw.split('/'))


def _parse_movie_detail(detailLink, html):
    """Build one CSV row (17 fields) from the HTML of a movie detail page.

    Fields missing from the page become empty strings instead of raising, so
    one incomplete page does not discard the rest of the batch.
    """
    tree = etree.HTML(html)

    def first(path):
        return _first(tree.xpath(path))

    def joined(path):
        return ','.join(tree.xpath(path))

    return [
        # title
        first('//span[@property="v:itemreviewed"]/text()'),
        # directors
        joined('(//span[@class="attrs"])[1]/a/text()'),
        # rating value
        first('//div[@class="rating_self clearfix"]/strong/text()'),
        # cast
        joined('//div[@id="info"]/span[3]/span[2]/a/text()'),
        # cover image
        first('//div[@id="mainpic"]/a/img/@src'),
        # detail page link
        detailLink,
        # year, with the surrounding parentheses removed
        first('//span[@class="year"]/text()').strip('()'),
        # genres
        joined('//span[@property="v:genre"]/text()'),
        # countries / regions (not wrapped in their own tag, hence the regex)
        _split_slash_field('<span class="pl">制片国家/地区:</span>(.*?)<br/>', html),
        # languages
        _split_slash_field('<span class="pl">语言:</span>(.*?)<br/>', html),
        # release dates
        joined('//span[@property="v:initialReleaseDate"]/text()'),
        # runtime
        joined('//span[@property="v:runtime"]/text()'),
        # number of ratings
        first('//div[@class="rating_sum"]/a/span/text()'),
        # share of each star level
        joined('//div[@class="ratings-on-weight"]/div[@class="item"]/span[@class="rating_per"]/text()'),
        # summary
        first('//span[@property="v:summary"]/text()').strip(),
        # related picture list
        joined('//div[@id="related-pic"]/ul/li/a/img/@src'),
        # trailer link
        joined('//a[@class="related-pic-video"]/@href'),
    ]


def douBanSpider():
    """Crawl the Douban Top 250 list page by page and append the rows to the CSV.

    Starts from the list-page index returned by ``get_page()``. For each list
    page it collects the detail-page links, scrapes every detail page, saves
    the page's rows with ``save_to_csv()`` and records the next index with
    ``set_page()``. It stops after the last list page.

    Raises:
        requests.RequestException: a request timed out, failed, or returned
            an HTTP error status.
        RuntimeError: a list page contained no movie links. The page counter
            is not advanced in that case, so no movies are silently skipped.
    """
    page = int(get_page())
    while True:
        respTotalHTML = requests.get(
            totalUrl,
            headers=headers,
            params={'start': page * _PAGE_SIZE},
            timeout=_REQUEST_TIMEOUT,
        )
        respTotalHTML.raise_for_status()
        urlList = etree.HTML(respTotalHTML.text).xpath('//div[@class="hd"]/a/@href')
        if not urlList:
            raise RuntimeError(
                f'no movie links found on list page {page}; '
                'the request may have been blocked'
            )

        # Rows are saved only once the whole page is scraped, so a failure
        # midway leaves the CSV and the page counter consistent for a re-run.
        resultList = []
        for index, detailLink in enumerate(urlList):
            respDetailHTML = requests.get(detailLink, headers=headers, timeout=_REQUEST_TIMEOUT)
            respDetailHTML.raise_for_status()
            resultData = _parse_movie_detail(detailLink, respDetailHTML.text)
            resultList.append(resultData)
            print(f'爬取第{page * _PAGE_SIZE + index + 1}部电影完成 - {resultData[0]}')

        save_to_csv(resultList)
        if page + 1 >= _TOTAL_PAGES:
            print('豆瓣top250电影爬取完成')
            break
        page += 1
        set_page(page)
        
        

### 运行

In [None]:
# Start the crawl; it begins at the page index last recorded in ./file/page.txt.
douBanSpider()