**解析豆瓣读书top 250 - 首页单本书数据**

In [None]:
import requests
from pyquery import PyQuery as pq
import re
import csv

In [None]:
# Address of the Douban Books Top 250 listing (first page).
url = 'https://book.douban.com/top250'

In [None]:
def get_html(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The response body decoded to ``str`` (``Response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # Adjacent literals are concatenated by the parser, so the header value
    # contains no stray indentation (a backslash continuation inside a single
    # literal would embed the next line's leading spaces in the string).
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
        )
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of handing an error page to the parser.
    resp.raise_for_status()
    return resp.text

In [4]:
# Sample markup for a single book row (<tr class="item">), hard-coded so the
# parsing cells below can run without a network request.
book_html = """
<tr class="item">
          <td width="100" valign="top">
            <a class="nbg" href="https://book.douban.com/subject/1007305/" onclick="moreurl(this,{i:'0'})">
              <img src="https://img1.doubanio.com/view/subject/s/public/s1070959.jpg" width="90"/>
            </a>
          </td>
          <td valign="top">
            
            <div class="pl2">


              <a href="https://book.douban.com/subject/1007305/" onclick="&quot;moreurl(this,{i:'0'})&quot;" title="红楼梦">
                红楼梦

                
              </a>



                  <img src="/pics/read.gif" alt="可试读" title="可试读"/>

              
            </div>

              <p class="pl">[清] 曹雪芹 著 / 人民文学出版社 / 1996-12 / 59.70元</p>

            

              
              <div class="star clearfix">
                  <span class="allstar50"/>
                  <span class="rating_nums">9.6</span>

                <span class="pl">(
                    413885人评价
                )</span>
              </div>

            
              <p class="quote" style="margin: 10px 0; color: #666">
                  <span class="inq">都云作者痴，谁解其中味？</span>
              </p>


          </td>
        </tr>
"""


# 数据解析

In [None]:
# Step 1: turn the HTML string into a PyQuery object so it can be queried
# with CSS selectors.
doc = pq(book_html)

------------------------------ 

- **书名**

In [5]:
# Title: text of the link inside div.pl2.
book_name = doc('div.pl2 a').text()
print(book_name)

红楼梦


--------------------------------------------------------------

- **基本信息**

In [6]:
# p.pl holds the basic info as one '/'-separated line
# (author / publisher / publication date / price in this sample).
info = doc('p.pl').text()
print(info)

[清] 曹雪芹 著 / 人民文学出版社 / 1996-12 / 59.70元


In [7]:
# Split on '/' only: each piece keeps the spaces that surrounded the separator.
info_list = info.split('/')
info_list

['[清] 曹雪芹 著 ', ' 人民文学出版社 ', ' 1996-12 ', ' 59.70元']

- **价格**

In [8]:
# Price is the last field.
price = info_list[-1]
price

' 59.70元'

- **出版时间**

In [9]:
# Publication date is the second field from the end.
pub_time = info_list[-2]
pub_time

' 1996-12 '

- **出版社**


In [10]:
# Publisher is the third field from the end.
publisher = info_list[-3]
publisher

' 人民文学出版社 '

- **作者**

In [11]:
# Every field before the last three is treated as author information and
# re-joined with ','; for this sample that is a single field.
authors = ','.join(info_list[:-3])
authors

'[清] 曹雪芹 著 '

------------------------------------------------------------

- **评分**

In [12]:
# Rating score, kept as the text of span.rating_nums (a string, not a float).
rating_nums = doc('span.rating_nums').text()
rating_nums

'9.6'

- **评分人数**

In [13]:
# Raw text of span.pl; the number of ratings is extracted from it in the next cell.
rating_count_raw = doc('span.pl').text()
rating_count_raw

'( 413885人评价 )'

In [14]:
# Take the first run of digits from the raw text. The pattern is a raw string:
# '\d' in a normal literal is an invalid escape sequence and triggers a
# SyntaxWarning on recent Python versions.
rating_count = re.findall(r'\d+', rating_count_raw)[0]
rating_count

'413885'

- **介绍**

In [15]:
# One-line blurb: text of span.inq inside p.quote.
quote = doc('p.quote span.inq').text()
quote

'都云作者痴，谁解其中味？'

------------------------------ 

## 合并版：整合数据解析的代码

In [None]:
# Parse the sample row into individual fields.
doc = pq(book_html)

book_name = doc('div.pl2 a').text()  # title

# One '/'-separated line: author / publisher / publication date / price.
info = doc('p.pl').text()
info_list = info.split('/')
price = info_list[-1]  # price
pub_time = info_list[-2]  # publication date
publisher = info_list[-3]  # publisher
authors = ','.join(info_list[:-3])  # author info: every field before the last three

rating_nums = doc('span.rating_nums').text()  # rating score

rating_count_raw = doc('span.pl').text()
# Raw string: '\d' in a normal literal is an invalid escape sequence and
# triggers a SyntaxWarning on recent Python versions.
rating_count = re.findall(r'\d+', rating_count_raw)[0]  # number of ratings

quote = doc('p.quote span.inq').text()  # one-line blurb


# 将以上结果保存为一个字典

In [16]:
# Gather the parsed fields into one record. Keyword order is preserved by
# dict(), so the keys appear in the order listed here.
book_info = dict(
    book_name=book_name,
    price=price,
    pub_time=pub_time,
    publisher=publisher,
    authors=authors,
    rating_nums=rating_nums,
    rating_count=rating_count,
    quote=quote,
)

print(book_info)

{'book_name': '红楼梦', 'price': ' 59.70元', 'pub_time': ' 1996-12 ', 'publisher': ' 人民文学出版社 ', 'authors': '[清] 曹雪芹 著 ', 'rating_nums': '9.6', 'rating_count': '413885', 'quote': '都云作者痴，谁解其中味？'}


# 将字典内容保存为本地csv格式文件

In [None]:
# Append the record to 'book_info.csv' (the file is created if missing).
# newline='' lets the csv module control line endings, which avoids blank
# rows between records. The `with` block closes the file even if a write fails.
with open('book_info.csv', 'a+', encoding='utf-8', newline='') as file:
    # Column order follows the key order of book_info.
    fieldnames = list(book_info.keys())
    writer = csv.DictWriter(file, fieldnames=fieldnames)

    # Append mode keeps rows from earlier runs, so write the header only when
    # the file is still empty; otherwise each re-run would add another header.
    file.seek(0, 2)  # whence=2: move to the end of the file
    if file.tell() == 0:
        writer.writeheader()

    writer.writerow(book_info)

In [None]:
# Alternative form using a `with` block (commented out, kept for reference):
# with open('book_info.csv', 'a+', encoding='utf-8', newline='') as file:
#     fieldnames = book_info.keys()
#     writer = csv.DictWriter(file, fieldnames=fieldnames)
#     writer.writeheader()
#     writer.writerow(book_info)