# Books Webscraping

## 1. Coletar os seguintes dados da página: https://books.toscrape.com
    • Catálogo:
        – Classics
        – Science Fiction
        – Humor
        – Business
        
    • Coletar os seguintes dados de cada livro:
        – Nome do livro
        – Preço em libras
        – Avaliação dos consumidores
        – Disponível em estoque

In [236]:
import smtplib
import requests
import inflection 

import pandas as pd

from bs4 import BeautifulSoup
from datetime import datetime

## 1.1. Set API Request

In [2]:
url = 'https://books.toscrape.com'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

page = requests.get( url, headers=headers )
soup = BeautifulSoup( page.text, 'html.parser' )

In [31]:
page_results = list( filter( None, soup.find( 'form', class_='form-horizontal' ).get_text().split('\n') ) )[0]
total_products = int( page_results[0:4] )
total_showcase = int( page_results[-3:-1] )
total_requests = int( total_products / total_showcase)

## 1.2. All Books Name & Price

In [223]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
aux_p = []
aux_a = []

for i in range( 1, total_requests+1 ): # Get all Price & Name Books.
    url_request = 'https://books.toscrape.com/catalogue/page-' + str(i) + '.html'

    page = requests.get( url_request, headers=headers )
    soup = BeautifulSoup( page.text, 'html.parser' )
    
    product_showcase = soup.find( 'ol', class_='row' )
    
    #book_name
    product_list = product_showcase.find_all('a', title=True)
    p_name = [p['title'] for p in product_list]
    aux_a.append( p_name )
    
    #book_price
    product_list = product_showcase.find_all( 'article', class_='product_pod' )
    product_list[1].find('p', class_='price_color').get_text()
    p_price = [p.find('p', class_='price_color').get_text().replace('Â£', '') for p in product_list]
    aux_p.append( p_price )
    
p_price = []   # Array with all prices.
for i in aux_p:
    for j in i:
        p_price.append( j )

p_name = []
for i in aux_a:
    for j in i:
        p_name.append( j )

In [233]:
df_showcase = pd.DataFrame( [p_name, p_price] ).T
df_showcase.columns = ['name', 'price']

## 1.3. All Books Links

In [157]:
aux_link = []
for i in range( 1, total_requests+1 ): # Get all link info
    url_request = 'https://books.toscrape.com/catalogue/page-' + str(i) + '.html'

    page = requests.get( url_request, headers=headers )
    soup = BeautifulSoup( page.text, 'html.parser' )

    product_showcase = soup.find( 'ol', class_='row' )
    product_list = product_showcase.find_all('a', href=True)
    
    for i in range( 1, 40, 2 ):
        aux_link.append( product_list[i]['href'] )

## 1.4. All Books Category, Stock & Id

In [161]:
url = 'https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html'

page = requests.get( url, headers=headers )
soup = BeautifulSoup( page.text, 'html.parser' )

In [201]:
#book_stock
product_stock = int(list( filter( None, soup.find('p', 'instock availability').get_text().split('\n') ) )[1].replace(' ', '').replace('Instock(', '').replace('available)', ''))

#book category
product_category = list( filter( None, soup.find('ul', class_='breadcrumb' ).get_text().split('\n') ) )[2]

#book_id
product_table = soup.find('table', class_='table table-striped')
product_upc = product_table.find('td').get_text()

In [225]:
cols = ['upc', 'category', 'stock', 'price']
df_details = pd.DataFrame()

for i in aux_link:
    url  = 'https://books.toscrape.com/catalogue/' + i
    page = requests.get( url, headers=headers )
    soup = BeautifulSoup( page.text, 'html.parser' )
    
    #book_name
    p_name = soup.find('h1').get_text()
    
    #book_stock
    p_stock = int(list( filter( None, soup.find('p', 'instock availability').get_text().split('\n') ) )[1].replace(' ', '').replace('Instock(', '').replace('available)', ''))

    #book category
    p_category = list( filter( None, soup.find('ul', class_='breadcrumb' ).get_text().split('\n') ) )[2]

    #book_id
    product_table = soup.find('table', class_='table table-striped')
    p_upc = product_table.find('td').get_text()
    
    df_info = pd.DataFrame( [p_name, p_upc, p_category, p_stock] ).T
    df_info.columns = ['name', 'upc', 'category', 'stock']
    
    df_details = pd.concat( [df_details, df_info], axis=0 )

In [230]:
df_details.head(3)

Unnamed: 0,name,upc,category,stock
0,A Light in the Attic,a897fe39b1053632,Poetry,22
0,Tipping the Velvet,90fa61229261140a,Historical Fiction,20
0,Soumission,6957f44c3847a760,Fiction,20


In [229]:
df_raw = pd.merge( df_showcase, df_details, on='name', how='left' ) # Join Dataframe

df_raw['scrapy_datetime'] = datetime.now().strftime( '%Y-%m-%d %H:%M:%S' ) # Save Scrapy datetime

df_raw.to_csv( 'books.csv' ) # Save to CSV

df_raw.head()

Unnamed: 0,name,price,upc,category,stock
0,A Light in the Attic,51.77,a897fe39b1053632,Poetry,22
1,Tipping the Velvet,53.74,90fa61229261140a,Historical Fiction,20
2,Soumission,50.1,6957f44c3847a760,Fiction,20
3,Sharp Objects,47.82,e00eb4fd7b871a48,Mystery,20
4,Sapiens: A Brief History of Humankind,54.23,4165285e1663650f,History,20


# 2.0. Send Email with Books Csv

In [245]:
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders

In [242]:
email = '''Olá,
Segue abaixo um anexo com a extração de dados
do site books.toscrape.com as ''' + str(df_raw['scrapy_datetime'][0])

In [1]:
sender_email = ''
sender_pass  = ''
receiver_email = ''

msg = MIMEMultipart()
msg['From'] = sender_email
msg['To']   = receiver_email
msg['Subject'] = 'Coleta de dados de Livros'

msg.attach( MIMEText(email, 'plain') )
attach_file = open( 'books.csv', 'rb' )

pl = MIMEBase( 'application', 'octate-stream' )
pl.set_payload( ( attach_file ).read() )
encoders.encode_base64( pl )
pl.add_header( 'Content-Disposition', "attachment; filename=books.csv" )
msg.attach( pl )

session = smtplib.SMTP('smtp.gmail.com', 587)
session.starttls()
session.login(sender_email, sender_pass)
text = msg.as_string()
session.sendmail(sender_email, receiver_email, text)
session.quit()