# 导入相关的包

In [6]:
import requests
from pyquery import PyQuery  
import re
import csv

# 生成url

In [7]:
def generate_url():
    """Build the list of Douban Music Top 250 page URLs.

    Returns:
        list[str]: Ten URLs whose ``start`` query value runs
        0, 25, 50, ..., 225 (one URL per result page).
    """
    base = 'https://music.douban.com/top250?start={num}'
    # Step through the offsets directly instead of deriving them from a
    # 1-based page index.
    return [base.format(num=offset) for offset in range(0, 250, 25)]

# 请求网页数据

In [8]:
def get_html(url, timeout=10):
    """Download a single page and return its HTML source as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        str: The response body decoded to text (``resp.text``).

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP 4xx/5xx status (via ``raise_for_status``).
    """
    # Adjacent string literals are concatenated by the parser. The previous
    # backslash continuations sat *inside* the literal, so the indentation of
    # the following lines was sent as part of the User-Agent value.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
        )
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to the
    # parser, which would silently yield zero records.
    resp.raise_for_status()
    return resp.text

# 解析网页

In [4]:
def extract_musicinfo_list(html):
    """Parse one page of HTML into a list of album records.

    Every ``tr.item`` element in the document yields one dict with the keys
    ``music_name``, ``musician``, ``pub_time``, ``type``, ``score`` and
    ``people_num``. All values are strings taken from the page text; the
    '/'-separated fields are not stripped, so they keep any surrounding
    whitespace.

    Items whose info line has fewer than two '/'-separated fields, or whose
    ``span.pl`` text contains no digits, are skipped. Any other error is
    allowed to propagate rather than being silently swallowed.

    Args:
        html: HTML source of a page, as a string.

    Returns:
        list[dict]: One dict per successfully parsed ``tr.item`` element.
    """
    musicinfo_list = []   # collected records, one dict per album
    doc = PyQuery(html)
    for music in doc.items('tr.item'):
        # Case 1: values read directly from a selector.
        music_name = music('.pl2 a').text()
        score = music('.rating_nums').text()

        # Case 2: values picked out of a '/'-separated info line.
        info_list = music('p.pl').text().split('/')
        if len(info_list) < 2:
            # Not enough fields to read both musician and pub_time.
            continue
        musician = info_list[0]
        pub_time = info_list[1]
        music_type = info_list[-1]   # local renamed so the builtin `type` is not shadowed

        # Case 3: value extracted with a regular expression.
        people_num_raw = music('span.pl').text()
        people_num_matches = re.findall('[0-9]+', people_num_raw)
        if not people_num_matches:
            # No digit run found in the text.
            continue
        people_num = people_num_matches[0]

        # One dict per album; the keys match the CSV field names used by main().
        musicinfo_list.append({
            'music_name': music_name,
            'musician': musician,
            'pub_time': pub_time,
            'type': music_type,
            'score': score,
            'people_num': people_num,
        })

    return musicinfo_list

# main 函数

In [16]:
def main(filename):
    """Scrape every Top 250 page and append the records to a CSV file.

    The file is opened in append mode, so rows from earlier runs are kept.
    The header row is written only when the file is new or empty, so
    re-running this function does not insert extra header lines in the
    middle of the data.

    Args:
        filename: Path of the CSV file to write. Its parent directory is
            created if it does not exist.
    """
    import os  # local import so this cell does not depend on edits to the import cell

    print("开始采集豆瓣音乐top 250")
    url_list = generate_url()

    # Make sure the target directory exists before opening the file.
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Decide about the header before opening: append mode creates the file.
    need_header = (not os.path.exists(filename)) or os.path.getsize(filename) == 0

    fieldnames = ['music_name', 'musician', 'pub_time', 'type', 'score', 'people_num']
    # `with` guarantees the file is flushed and closed even if a request or
    # the parser raises part-way through.
    with open(filename, "a+", encoding="utf-8", newline="") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        if need_header:
            writer.writeheader()

        for url in url_list:
            print("正在采集:{url}".format(url=url))
            html = get_html(url)
            musicinfo_list = extract_musicinfo_list(html)
            writer.writerows(musicinfo_list)

    print("采集完毕！")

# 使用main函数开始抓取

In [17]:
# Run the full scrape; the records are written to ./data/music_250.csv.
main(filename="./data/music_250.csv")

开始采集豆瓣音乐top 250
正在采集:https://music.douban.com/top250?start=0
正在采集:https://music.douban.com/top250?start=25
正在采集:https://music.douban.com/top250?start=50
正在采集:https://music.douban.com/top250?start=75
正在采集:https://music.douban.com/top250?start=100
正在采集:https://music.douban.com/top250?start=125
正在采集:https://music.douban.com/top250?start=150
正在采集:https://music.douban.com/top250?start=175
正在采集:https://music.douban.com/top250?start=200
正在采集:https://music.douban.com/top250?start=225
采集完毕！


**注意：** 第4、5、6页页面只有24张专辑，所以爬取的内容不足250条（本次运行记录为248条；若这三页各缺1张，总数应为247条，请以实际写入CSV的数据行数为准）。