## 1. Install and import libraries
- beautifulsoup4: used to parse HTML and extract data from crawled web pages
- requests: used to send HTTP requests and fetch the page content

In [3]:
import requests as req
from bs4 import BeautifulSoup
import logging as lg
import os
import csv


In [4]:
# Logging setup: records go to app.log (opened in write mode) with a
# timestamp and level prefix; the root logger accepts INFO and above.
lg.basicConfig(
    filename="app.log",
    filemode="w",
    format='%(asctime)s - [%(levelname)s]: %(message)s',
)
lg.getLogger().setLevel(lg.INFO)

In [13]:
# Crawl configuration and the shared containers filled by later cells.
BASE_URL = "https://www.bookdepository.com"  # site root; category/book hrefs are appended to it
DIR = "./data/img/"  # root folder for the downloaded cover images
LINK_CATE, books = [], []  # category hrefs / scraped Book records


In [14]:
# Fetch the home page and collect the href of every top-level category.
# A timeout keeps the cell from hanging forever on an unresponsive server.
page = req.get(BASE_URL, timeout=30)
# Fail fast on an HTTP error status so the success message below is only
# logged for a usable response.
page.raise_for_status()
lg.info("Get data from api successfully!")

soup = BeautifulSoup(page.content, "html.parser")

category = soup.find_all("li", class_="top-category")

for cate in category:
    link = cate.find("a", href=True)
    if link is None:
        # A category item without a link offers nothing to crawl.
        continue
    LINK_CATE.append(link['href'])
    # The last path segment of the href is the category name.
    lg.info("Add {0}".format(link['href'].split("/")[-1]))

lg.info(f'Total have {len(LINK_CATE)} categories')


In [8]:
class Book:
    """Record of one scraped book.

    Iterating over an instance yields its field values in constructor
    order, which lets ``csv.writer.writerows`` consume a list of books
    directly.
    """

    def __init__(self, title, author, price, number_of_page, publish_date, publisher, country, language, rank, img_title, category):
        self.title = title
        self.author = author
        self.price = price
        self.number_of_page = number_of_page
        self.publish_date = publish_date
        self.publisher = publisher
        self.country = country
        self.language = language
        self.rank = rank
        self.img_title = img_title
        self.category = category

    def __iter__(self):
        """Return an iterator over the field values in constructor order."""
        return iter([
            self.title,
            self.author,
            self.price,
            self.number_of_page,
            self.publish_date,
            self.publisher,
            self.country,
            self.language,
            self.rank,
            self.img_title,
            self.category,
        ])

    def to_string(self):
        """Return all fields joined by " - ".

        Every value goes through ``str()`` so that a non-string field
        (for example ``None`` or a number) no longer raises TypeError.
        """
        return " - ".join(str(value) for value in self)


In [15]:
LINK_CATE

['/category/2/Art-Photography',
 '/category/213/Biography',
 '/category/2455/Childrens-Books',
 '/category/2942/Crafts-Hobbies',
 '/category/2616/Crime-Thriller',
 '/category/333/Fiction',
 '/category/2858/Food-Drink',
 '/category/2633/Graphic-Novels-Anime-Manga',
 '/category/2638/History-Archaeology',
 '/category/2819/Mind-Body-Spirit',
 '/category/2623/Science-Fiction-Fantasy-Horror']

In [16]:
# get link of each book

def get_text_from_html(content):
    """Return the whitespace-stripped ``.text`` of ``content``.

    A ``None`` argument (no element found) yields the placeholder "NONE".
    """
    return "NONE" if content is None else content.text.strip()


REQUEST_TIMEOUT = 30  # seconds to wait on a single HTTP request before giving up


def _detail_text(items, index, default, **span_filter):
    """Return the stripped text of the matching <span> inside ``items[index]``.

    Returns ``default`` when ``items`` has no entry at ``index``. When the
    entry exists but holds no matching span, the result is whatever
    ``get_text_from_html`` returns for ``None``.
    """
    if index >= len(items):
        return default
    return get_text_from_html(items[index].find("span", **span_filter))


# Visit every category page, follow each listed book, scrape its details,
# download its cover image and append a Book record to `books`.
for link_cate in LINK_CATE:
    count = 1  # per-category counter; also used as the image file name
    cate_name = link_cate.split("/")[-1]

    # open() does not create missing folders, so make the per-category
    # image directory before any cover is written.
    img_dir = os.path.join(DIR, cate_name)
    os.makedirs(img_dir, exist_ok=True)

    try:
        page_cate = req.get(BASE_URL + link_cate, timeout=REQUEST_TIMEOUT)
        page_cate.raise_for_status()
    except req.RequestException as err:
        lg.warning("Skip category %s: %s", cate_name, err)
        continue
    soup_cate = BeautifulSoup(page_cate.content, "html.parser")

    for item in soup_cate.find_all("div", class_="item-img"):
        link_to_book = item.find("a", href=True)
        if link_to_book is None:
            # No anchor with an href inside this item: nothing to follow.
            continue

        book_url = BASE_URL + link_to_book['href']
        try:
            book_page = req.get(book_url, timeout=REQUEST_TIMEOUT)
            book_page.raise_for_status()
        except req.RequestException as err:
            lg.warning("Skip book %s: %s", book_url, err)
            continue
        soup_book = BeautifulSoup(book_page.content, "html.parser")

        # get info about book
        info = soup_book.find('div', class_='item-info')
        if info is None:
            continue
        title = get_text_from_html(info.find('h1', class_=None))
        author = get_text_from_html(info.find("span", itemprop="name"))
        tools = soup_book.find("div", class_='item-tools')
        price = get_text_from_html(
            tools.find('span', class_='sale-price') if tools is not None else None)

        # get detail of book: only the first biblio-info list is used, and
        # a page without one falls back to the per-field defaults below.
        details = soup_book.find('ul', class_='biblio-info')
        li_tag = details.find_all("li", class_=None) if details is not None else []
        number_of_page = _detail_text(li_tag, 0, "0", itemprop="numberOfPages")
        publish_date = _detail_text(li_tag, 2, "0", itemprop='datePublished')
        publisher = _detail_text(li_tag, 3, "NONE", itemprop='name')
        country = _detail_text(li_tag, 5, "NONE", class_=None)
        language = _detail_text(li_tag, 6, "NONE", class_=None)
        rank = _detail_text(li_tag, 8, "0", class_=None)

        # get image of book: download first and only then open the file,
        # so a failed download does not leave an empty image behind.
        cover = soup_book.find('img', class_='book-img')
        cover_src = cover.get('src') if cover is not None else None
        temp = "{0}.jpg".format(count)
        saved_name = "NONE"  # recorded when no cover could be stored
        if cover_src:
            try:
                cover_response = req.get(cover_src, timeout=REQUEST_TIMEOUT)
                cover_response.raise_for_status()
            except req.RequestException as err:
                lg.warning("No cover for %s: %s", book_url, err)
            else:
                with open(os.path.join(img_dir, temp), 'wb') as f:
                    f.write(cover_response.content)
                saved_name = temp
        else:
            lg.warning("No cover image found for %s", book_url)

        new_book = Book(title, author, price, number_of_page, publish_date,
                        publisher, country, language, rank, saved_name,
                        cate_name.lower().replace("-", '_'))
        books.append(new_book)
        lg.info("Add {0} {1}".format(count, new_book.to_string()))
        count = count + 1


In [18]:
# write book into file
# The column names must match, one for one and in the same order, the
# values a Book yields when iterated (11 fields). The former 'dimensions'
# entry had no matching value and shifted every later column by one.
headers = ['title', 'author', 'price', 'number_of_page',
           'publish_date', 'publisher', 'country', 'language', 'rank', 'img_title', 'category']

# newline='' hands line-ending control to the csv module, as its
# documentation requires; without it extra blank rows can appear on Windows.
with open("./data/results.csv", 'w', encoding='UTF-8', newline='') as stream:
    writer = csv.writer(stream)
    writer.writerow(headers)
    # Each Book is iterable, so writerows turns every book into one row.
    writer.writerows(books)

In [19]:
# Report how many Book records are currently held in `books`.
print(len(books))

2139
