In [3]:
from pyquery import PyQuery as pq
import re
import csv

In [4]:
# Sample markup for a single album entry; every parsing cell below reads from it.
html = """
<div class="pl2">
        <a href="https://music.douban.com/subject/2995812/" onclick="moreurl(this,{i:'0',query:'',subject_id:'2995812',from:'music_subject_search'})" >
            We Sing. We Dance. We Steal Things.
       </a>

            <p class="pl">Jason Mraz / 2008-05-13 / Import / Audio CD / 民谣</p>

        
        
                    <div class="star clearfix"><span class="allstar45"></span><span class="rating_nums">9.1</span>
                <span class="pl">
                    (
                            116386人评价
                    )
                </span></div>
"""

# Wrap the markup in a PyQuery document so elements can be picked with CSS selectors.
doc = pq(html)

# 数据解析

- **音乐名**

In [5]:
# The album title is the text of the <a> link inside the div.pl2 container.
music_name = doc('div.pl2 a').text()
print(music_name)

We Sing. We Dance. We Steal Things.


--------------------------------------------------------------

- **基本信息**

In [6]:
# The <p class="pl"> element carries the album metadata as one '/'-separated line.
info = doc('p.pl').text()
print(info)

Jason Mraz / 2008-05-13 / Import / Audio CD / 民谣


In [7]:
# Break the metadata line into its fields.
# Splitting on '/' alone leaves the spaces around each separator in the values.
info_list = info.split('/')
info_list

['Jason Mraz ', ' 2008-05-13 ', ' Import ', ' Audio CD ', ' 民谣']

- **音乐类型**

In [None]:
# The music type is taken from the last field of the metadata line.
music_type = info_list[-1]
music_type

' 民谣'

- **发行时间**

In [9]:
# The release date is taken from the second field of the metadata line.
pub_time = info_list[1]
pub_time

' 2008-05-13 '

- **音乐人**

In [10]:
# The musician is taken from the first field of the metadata line.
musician = info_list[0]
musician

'Jason Mraz '

------------------------------------------------------------

- **评分**

In [11]:
# The score is the text of <span class="rating_nums">; it stays a string here.
rating_nums = doc('span.rating_nums').text()
rating_nums

'9.1'

- **评分人数**

In [12]:
# <span class="pl"> holds the number of ratings wrapped in parentheses and a
# text suffix, so the digits still have to be extracted from it.
rating_count_raw = doc('span.pl').text()
rating_count_raw

'( 116386人评价 )'

In [13]:
# Keep only the first run of digits from the raw rating-count text.
# The pattern is a raw string: in a plain literal '\d' is an invalid escape
# sequence, which emits a SyntaxWarning on Python 3.12+.
rating_count = re.findall(r'\d+', rating_count_raw)[0]
rating_count

'116386'

## 以上代码整合在一起

In [None]:
# Album title: text of the link inside the div.pl2 container.
music_name = doc('div.pl2 a').text()

# Metadata line: '/'-separated fields read by position.
# NOTE(review): the fields keep the spaces that surround each '/'; apply
# .strip() here if clean values are wanted downstream.
info = doc('p.pl').text()
info_list = info.split('/')
music_type = info_list[-1]  # last field
pub_time = info_list[1]     # second field
musician = info_list[0]     # first field

# Score, kept as the string found in the markup.
rating_nums = doc('span.rating_nums').text()

# Number of ratings: first run of digits in the span.pl text.
# Raw-string pattern: a plain '\d' literal is an invalid escape sequence and
# emits a SyntaxWarning on Python 3.12+.
rating_count_raw = doc('span.pl').text()
rating_count = re.findall(r'\d+', rating_count_raw)[0]



# 将以上结果保存为一个字典

In [None]:
# Bundle the parsed fields into a single record, one key per field.
music_info = dict(
    music_name=music_name,
    music_type=music_type,
    pub_time=pub_time,
    musician=musician,
    rating_nums=rating_nums,
    rating_count=rating_count,
)

print(music_info)

{'music_name': 'We Sing. We Dance. We Steal Things.', 'music_type': ' 民谣', 'pub_time': ' 2008-05-13 ', 'musician': 'Jason Mraz ', 'rating_nums': '9.1', 'rating_count': '116386'}


# 将字典内容保存为本地csv格式文件

In [29]:
# Append the record to 'music_info.csv', creating the file if it does not exist.
# newline='' leaves line endings to the csv module, which avoids extra blank rows.
# The with-block closes the file even if a write raises.
with open('music_info.csv', 'a+', encoding='utf-8', newline='') as file:
    # The dict's keys, in insertion order, are the CSV column names.
    writer = csv.DictWriter(file, fieldnames=list(music_info.keys()))

    # Write the header row only when the file is still empty, so re-running
    # this cell appends data rows without repeating the header.
    file.seek(0, 2)  # whence=2 moves to the end of the file
    if file.tell() == 0:
        writer.writeheader()

    # Write the record itself as one data row.
    writer.writerow(music_info)