# Two ways to get the books' info:
# 1. xpath
# 2. beautifulsoup

# In [1]:
from bs4 import  BeautifulSoup
import requests
from lxml import etree
import json


class BookSpider(object):
    """Download book listing pages, parse them and save the result as JSON.

    Pages are fetched from the URL template in ``basic_url``. Each book is
    stored as a dict with the keys ``book_name``, ``book_author``,
    ``book_pic`` and ``book_info``. Two interchangeable parsers are
    provided: one based on lxml XPath, one based on BeautifulSoup.
    """

    def __init__(self):
        """Set up the URL template, the request headers and the result list."""
        # "{}" is replaced by the page number in run().
        self.basic_url = "https://www.books.com.tw/web/cebook_new/?o=1&v=1&page={}"

        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"
                        }

        # One dict per parsed book; appended to by the parse_data_* methods.
        self.book_data = []

    def get_response(self, url, timeout=10):
        """Fetch *url* and return the response body decoded as UTF-8.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before requests raises
                a timeout error. Without it a stalled connection would
                block forever.

        Returns:
            The page content as a ``str``.
        """
        response = requests.get(url, headers=self.headers, timeout=timeout)
        return response.content.decode("utf-8")

    def parse_data_xpath(self, data):
        """Parse one listing page with lxml XPath and collect its books.

        Every ``<div class="item">`` is turned into a dict and appended to
        ``self.book_data``; the accumulated list is printed afterwards.

        Args:
            data: HTML source of a listing page.

        Raises:
            IndexError: If an item lacks a title, an author or an image.
        """
        page_tree = etree.HTML(data)

        for book in page_tree.xpath('//div[@class="item"]'):
            book_dict = dict()

            book_dict["book_name"] = book.xpath('.//div[@class = "msg"]/h4/a/text()')[0]
            book_dict["book_author"] = book.xpath('.//div[@class = "msg"]//li[@class = "info"]/a/text()')[0]
            book_dict["book_pic"] = book.xpath('.//a/img/@src')[0]

            # The description paragraph is optional; use a placeholder when
            # the item has none.
            try:
                book_dict["book_info"] = (book.xpath('.//p/text()'))[0]
            except IndexError:
                book_dict["book_info"] = "no info"

            self.book_data.append(book_dict)
        print(self.book_data)

    def parse_data_bs4(self, data):
        """Parse one listing page with BeautifulSoup and collect its books.

        Every ``div.item`` is turned into a dict and appended to
        ``self.book_data``; the accumulated list is printed afterwards.
        Parsing stops at the first item that has no title link.

        Args:
            data: HTML source of a listing page.
        """
        soup = BeautifulSoup(data, "lxml")

        for book in soup.select("div.item"):
            title_link = book.select_one("div.msg a")
            if title_link is None:
                # No title link: stop processing the remaining items.
                break

            book_dict = {}
            book_dict["book_name"] = title_link.get_text()

            book_dict["book_author"] = book.select_one("li.info a").get_text()

            # The description is optional; fall back to the same placeholder
            # that parse_data_xpath uses instead of raising AttributeError.
            info_node = book.select_one("div.txt_cont p")
            if info_node is None:
                book_dict["book_info"] = "no info"
            else:
                book_dict["book_info"] = info_node.get_text()

            book_dict["book_pic"] = book.select_one(".cover").get('src')

            self.book_data.append(book_dict)

        print(self.book_data)

    def save_data(self):
        """Write ``self.book_data`` to ``books_json.json`` as JSON."""
        # The context manager guarantees the file is flushed and closed even
        # if serialisation fails part-way through.
        with open("books_json.json", "w", encoding="utf-8") as json_file:
            json.dump(self.book_data, json_file)

    def run(self):
        """Fetch and parse the listing pages, then save the collected books."""
        # range(1, 2) covers page 1 only; widen the range to crawl more pages.
        for i in range(1, 2):
            url = self.basic_url.format(i)

            data = self.get_response(url)
            self.parse_data_xpath(data)
            # self.parse_data_bs4(data)
        self.save_data()

    # Alternative way to build the list of page URLs:

    # def get_url_list(self):
    #
    #     url_list = []
    #
    #     self.basic_url = "https://www.books.com.tw/web/cebook_new/?o=1&v=1&page={}"
    #
    #     for i in range(1,2):
    #         url = self.basic_url.format(i)
    #         url_list.append(url)
    #     return url_list


# Script entry point: build the spider and start the crawl.
spider = BookSpider()
spider.run()



[{'book_name': '黑魔法糖果店3：超倒楣軟糖 (電子書)', 'book_author': '草野昭子', 'book_pic': 'https://im2.book.com.tw/image/getImage?i=https://www.books.com.tw/img/E05/012/70/E050127049.jpg&v=62341bac&w=170&h=170', 'book_info': '～用一點點暗黑魔法，化解孩子壞情緒、解開人際心結～\n小朋友，有什麼煩惱嗎？\n愛欺負人的姊姊、跟屁蟲弟弟、愛到處捉弄人的同學……\n跟著巫婆給的小技巧，做出詛咒之糖，\n讓討厭的人都受到懲罰吧！\n\xa0\n'}, {'book_name': '黑魔法糖果店2：惡作劇汽水糖 (電子書)', 'book_author': '草野昭子', 'book_pic': 'https://im2.book.com.tw/image/getImage?i=https://www.books.com.tw/img/E05/012/70/E050127071.jpg&v=623439e7&w=170&h=170', 'book_info': '～用一點點暗黑魔法，化解孩子壞情緒、解開人際心結～\n小朋友，有什麼煩惱嗎？\n愛欺負人的姊姊、跟屁蟲弟弟、愛到處捉弄人的同學……\n跟著巫婆給的小技巧，做出詛咒之糖，\n讓討厭的人都受到懲罰吧！\n煩'}, {'book_name': '學習高手：哈佛、耶魯雙學霸的最強學習法 (電子書)', 'book_author': '李柘遠LEO', 'book_pic': 'https://im2.book.com.tw/image/getImage?i=https://www.books.com.tw/img/E05/012/70/E050127065.jpg&v=6234300f&w=170&h=170', 'book_info': '學習需要挑戰天性，也需要付出努力，而好的學習方法可以讓我們提高學習效率，取得更好的學習效果。《學習高手》這本書提供了有效的學習方法，促進我們自主學習，成為一個終身學習者。——樊'}, {'book_name': '日本第一女公關的人際溝通術：不靠靈巧也能創造億萬業績的祕密 (電子書)', 'book_