# Webscraping with Splinter

### Dependencies

In [1]:
import requests
from splinter import Browser
from bs4 import BeautifulSoup as soup

### Sites to scrape

In [2]:
# Quotes site url
quotes_url = 'https://quotes.toscrape.com/'

# Books site url
books_url = 'https://books.toscrape.com/'

### Scraping quotes with requests

In [4]:
# Make a request to the site
response = requests.get(quotes_url)

# Parse the html
response_soup = soup(response.text, 'html.parser')

In [5]:
# "Top Ten Tags" title
print(response_soup.find('h2').text)
print('============')

# Top 10 tags
tags_box = response_soup.find('div', class_='tags-box')
for tag in tags_box.find_all('a', class_='tag'):
    print(tag.text)

Top Ten tags
love
inspirational
life
humor
books
reading
friendship
friends
truth
simile


In [6]:
# "Quotes to Scrape" title
print(response_soup.find('h1').find('a').text)
print('================')

# Quotes
for quote in response_soup.find_all('div', class_='quote'):
    print()
    print(quote.find('span', class_='text').text)
    print('-----', quote.find('small', class_='author').text)

Quotes to Scrape

“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
----- Albert Einstein

“It is our choices, Harry, that show what we truly are, far more than our abilities.”
----- J.K. Rowling

“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
----- Albert Einstein

“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
----- Jane Austen

“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
----- Marilyn Monroe

“Try not to become a man of success. Rather become a man of value.”
----- Albert Einstein

“It is better to be hated for what you are than to be loved for what you are not.”
----- André Gide

“I have not failed. I've just found 10,000 ways that won't work.”
----- Thomas A. Edison

“A woman is like a tea bag; you never know

### Initialize splinter browser

In [7]:
# Executable path for chromedriver
!which chromedriver

/usr/local/bin/chromedriver


In [8]:
# Create browser instance
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

### Scraping quotes with splinter

In [9]:
# Visit site
browser.visit(quotes_url)

# "Quotes to Scrape" title
print('TOP QUOTE ON EACH PAGE')
print('======================')

# For each page
for p in range(1, 11):
    
    # Get html
    page_soup = soup(browser.html, 'html.parser')
    
    # Get 1st quote on page
    quote = page_soup.find('div', class_='quote')
    print()
    print(quote.find('span', class_='text').text)
    print('-----', quote.find('small', class_='author').text)
    
    # Go to next page
    if p != 10:
        browser.click_link_by_partial_text('Next')

TOP QUOTE ON EACH PAGE

“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
----- Albert Einstein

“This life is what you make it. No matter what, you're going to mess up sometimes, it's a universal truth. But the good part is you get to decide how you're going to mess it up. Girls will be your friends - they'll act like it anyway. But just remember, some come, some go. The ones that stay with you through everything - they're your true best friends. Don't let go of them. Also remember, sisters make the best friends in the world. As for lovers, well, they'll come and go too. And baby, I hate to say it, most of them - actually pretty much all of them are going to break your heart, but you can't give up because if you give up, you'll never find your soulmate. You'll never find that half who makes you whole and that goes for everything. Just because you fail once, doesn't mean you're gonna fail at everything. Keep trying, hold

### Scraping books with splinter

In [10]:
# Get site html
browser.visit(books_url)
book_soup = soup(browser.html, 'html.parser')

# Title
print('=================================')
print('TOP ROW OF BOOKS IN EACH CATEGORY')
print('=================================')

# Category href
cat_href = 'catalogue/category/books/'
href_start = lambda href: href.startswith(cat_href)

# Category list
categories = book_soup.find('div', class_='side_categories')
categories = categories.find_all('a', href=href_start)
categories = [cat.text.strip() for cat in categories]

# For each category
for cat in categories[:5]: # Limit to top 5 categories
    
    # Click category and get html
    browser.click_link_by_partial_text(cat)
    cat_soup = soup(browser.html, 'html.parser')
    
    # Category name
    print()
    print(cat)
    print('------------------')
    
    # Get top row of books
    for book in cat_soup.find_all('article', class_='product_pod')[:4]:
        title = book.find('h3').find('a').text
        price = book.find('p', class_='price_color').text
        print(f'{title} ---- {price}')

TOP ROW OF BOOKS IN EACH CATEGORY

Travel
------------------
It's Only the Himalayas ---- £45.17
Full Moon over Noah’s ... ---- £49.43
See America: A Celebration ... ---- £48.87
Vagabonding: An Uncommon Guide ... ---- £36.94

Mystery
------------------
It's Only the Himalayas ---- £45.17
Full Moon over Noah’s ... ---- £49.43
See America: A Celebration ... ---- £48.87
Vagabonding: An Uncommon Guide ... ---- £36.94

Historical Fiction
------------------
It's Only the Himalayas ---- £45.17
Full Moon over Noah’s ... ---- £49.43
See America: A Celebration ... ---- £48.87
Vagabonding: An Uncommon Guide ... ---- £36.94

Sequential Art
------------------
It's Only the Himalayas ---- £45.17
Full Moon over Noah’s ... ---- £49.43
See America: A Celebration ... ---- £48.87
Vagabonding: An Uncommon Guide ... ---- £36.94

Classics
------------------
It's Only the Himalayas ---- £45.17
Full Moon over Noah’s ... ---- £49.43
See America: A Celebration ... ---- £48.87
Vagabonding: An Uncommon Guide ... 

### Book scraping script

In [11]:
import re
from splinter import Browser
from bs4 import BeautifulSoup as Soup


# Initialize a browser
exe_path = '/usr/local/bin/chromedriver'
browser = Browser('chrome', executable_path=exe_path, headless=False)

# Visit the site and get the html
site_url = 'https://books.toscrape.com/'
site_url = 'https://books.toscrape.com/catalogue/category/books/mystery_3/index.html'
browser.visit(site_url)
book_soup = Soup(browser.html, 'html.parser')

# Book data
headings = 'upc,title,category,price,rating,num_reviews'
headings += ',in_stock,num_available,url\n'
book_data = [headings]

# Output file
outfile = 'book_data.csv'
with open(outfile, 'a') as f:
    f.write(headings)

# Book count
i = 0

In [12]:
# Continue scraping for all pages
while True:

    # Get a list of book titles on the page
    book_soup = Soup(browser.html, 'html.parser')
    books = book_soup.find_all('article', class_='product_pod')
    titles = [book.find('h3').find('a').text for book in books]
    
    # For each book on the page
    for t in titles:

        i += 1
        print(i, t)

        # Initialize data variables
        upc = title = category = price = rating = num_reviews = ''
        in_stock = num_available = url = '' 

        # Go to the book's page and get the html
        try:
            browser.click_link_by_partial_text(t)
            url = browser.url
            page_soup = Soup(browser.html, 'html.parser')
        except:
            print(' - Not found:', t)
            continue

        # Breadcrumb
        try:
            crumb = page_soup.find('ul', class_='breadcrumb').find_all('li')
            category = crumb[2].find('a').text
        except:
            print(' - Category missing')

        # Main product section
        try:
            p_main = page_soup.find('div', class_='product_main')
            title = p_main.find('h1').text
            price = p_main.find('p', class_='price_color').text[1:]
            rating = p_main.find('p', class_='star-rating').attrs['class'][1]
        except:
            print(' - Title/price/rating missing')

        # Stock availability
        try:
            stock = p_main.find('p', class_='availability').text.strip()
            stock = stock.split('(')
            in_stock = stock[0].strip()
            num_available = stock[1].split()[0]
        except:
            print(' - Stock/availability missing')

        # Product information
        try:
            p_info = page_soup.find('table')
            upc = p_info.find('td').text
            num_reviews = p_info.find_all('td')[-1].text
        except:
            print(' - UPC/reviews missing')

        # Click back
        if re.search(r'/page\-\d+\.', url):
            continue
        else:
            browser.back()

        # Add book to data
        book = f'{upc};{title};{category};{price};{rating};{num_reviews}'
        book += f';{in_stock};{num_available};{url}\n'
        book_data.append(book)

        # Write row to file
        with open(outfile, 'a') as f:
            f.write(book)

    # Go to next page
    try:
        browser.click_link_by_partial_text('next')
    except:
        break


# # Write data to file
# f = open(outfile, 'a')
# for book in book_data:
#     f.write(book)
# f.close()